New VDW kernel flavour for LJ-PME and updated group kernels
authorChristian Wennberg <christian.wennberg@scilifelab.se>
Fri, 13 Dec 2013 10:49:56 +0000 (11:49 +0100)
committerGerrit Code Review <gerrit@gerrit.gromacs.org>
Fri, 28 Mar 2014 23:29:51 +0000 (00:29 +0100)
Group-scheme LJ-PME now calculates the same properties as Verlet-scheme

The previous group kernels implemented the non-bonded interactions
as described in J. Chem. Theory Comput., 2013, 9 (8), pp 3527–3537,
These are now updated to subtract the grid contribution in real
space, which corrects for any difference in the reciprocal-space
part from the use of approximate LJ parameters.

Group kernels + LJ-PME now supports modifiers

Added a new interaction type "LJEwald" which is used in the
group scheme for simulations with LJ-PME.

Change-Id: I4c9394381bf813082afe1fdbc168659de2412da6

156 files changed:
src/gromacs/gmxlib/names.c
src/gromacs/gmxlib/nonbonded/nb_generic.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/make_nb_kernel_avx_128_fma_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_128_fma_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_template_avx_128_fma_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/make_nb_kernel_avx_128_fma_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_128_fma_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/make_nb_kernel_avx_256_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_avx_256_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_template_avx_256_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/make_nb_kernel_avx_256_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_avx_256_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_template_avx_256_single.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_c/make_nb_kernel_c.py
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_c.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_c.c
src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_template_c.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/make_nb_kernel_sse2_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse2_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_sse2_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/make_nb_kernel_sse2_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse2_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_sse2_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/make_nb_kernel_sse4_1_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse4_1_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_template_sse4_1_double.pre
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/make_nb_kernel_sse4_1_single.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse4_1_single.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/types/enums.h
src/gromacs/legacyheaders/types/forcerec.h
src/gromacs/mdlib/forcerec.c

index 42670fe0349fa10a06c687a15015d4da9422df86..69b5f5958b61a71acf1131581a505fec5c4aa38a 100644 (file)
@@ -279,5 +279,5 @@ const char *gmx_nbkernel_elec_names[GMX_NBKERNEL_ELEC_NR+1] =
 
 const char *gmx_nbkernel_vdw_names[GMX_NBKERNEL_VDW_NR+1] =
 {
-    "None", "Lennard-Jones", "Buckingham", "Cubic-Spline-Table", NULL
+    "None", "Lennard-Jones", "Buckingham", "Cubic-Spline-Table", "LJEwald", NULL
 };
index fbf88de132ca457be627262be13f9722ecbaeec9..bc58c7e6bb37a1dac289c9b471d02bfd33005c95 100644 (file)
@@ -76,10 +76,10 @@ gmx_nb_generic_kernel(t_nblist *                nlist,
     real          ix, iy, iz, fix, fiy, fiz;
     real          jx, jy, jz;
     real          dx, dy, dz, rsq, rinv;
-    real          c6, c12, cexp1, cexp2, br;
+    real          c6, c12, c6grid, cexp1, cexp2, br;
     real *        charge;
     real *        shiftvec;
-    real *        vdwparam;
+    real *        vdwparam, *vdwgridparam;
     int *         shift;
     int *         type;
     real *        fshift;
@@ -97,6 +97,7 @@ gmx_nb_generic_kernel(t_nblist *                nlist,
     real          rswitch_elec, rswitch_vdw, d, d2, sw, dsw, rinvcorr;
     real          elec_swV3, elec_swV4, elec_swV5, elec_swF2, elec_swF3, elec_swF4;
     real          vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
+    real          ewclj, ewclj2, ewclj6, ewcljrsq, poly, exponent, sh_lj_ewald;
     gmx_bool      bExactElecCutoff, bExactVdwCutoff, bExactCutoff;
 
     x                   = xx[0];
@@ -120,6 +121,11 @@ gmx_nb_generic_kernel(t_nblist *                nlist,
     rvdw2               = rvdw*rvdw;
     sh_dispersion       = fr->ic->dispersion_shift.cpot;
     sh_repulsion        = fr->ic->repulsion_shift.cpot;
+    sh_lj_ewald         = fr->ic->sh_lj_ewald;
+
+    ewclj               = fr->ewaldcoeff_lj;
+    ewclj2              = ewclj*ewclj;
+    ewclj6              = ewclj2*ewclj2*ewclj2;
 
     if (fr->coulomb_modifier == eintmodPOTSWITCH)
     {
@@ -172,7 +178,7 @@ gmx_nb_generic_kernel(t_nblist *                nlist,
     eps                 = 0.0;
     eps2                = 0.0;
 
-    /* 3 VdW parameters for buckingham, otherwise 2 */
+    /* 3 VdW parameters for Buckingham, otherwise 2 */
     nvdwparam           = (ivdw == GMX_NBKERNEL_VDW_BUCKINGHAM) ? 3 : 2;
     table_nelements     = 12;
 
@@ -182,6 +188,7 @@ gmx_nb_generic_kernel(t_nblist *                nlist,
     shiftvec            = fr->shift_vec[0];
     vdwparam            = fr->nbfp;
     ntype               = fr->ntype;
+    vdwgridparam        = fr->ljpme_c6grid;
 
     for (n = 0; (n < nlist->nri); n++)
     {
@@ -395,6 +402,29 @@ gmx_nb_generic_kernel(t_nblist *                nlist,
                         vvdw             = vvdw_disp + vvdw_rep;
                         break;
 
+
+                    case GMX_NBKERNEL_VDW_LJEWALD:
+                        /* LJ-PME */
+                        rinvsix          = rinvsq*rinvsq*rinvsq;
+                        ewcljrsq         = ewclj2*rsq;
+                        exponent         = exp(-ewcljrsq);
+                        poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+                        c6               = vdwparam[tj];
+                        c12              = vdwparam[tj+1];
+                        c6grid           = vdwgridparam[tj];
+                        vvdw_disp        = (c6-c6grid*(1.0-poly))*rinvsix;
+                        vvdw_rep         = c12*rinvsix*rinvsix;
+                        fvdw             = (vvdw_rep - vvdw_disp - c6grid*(1.0/6.0)*exponent*ewclj6)*rinvsq;
+                        if (fr->vdw_modifier == eintmodPOTSHIFT)
+                        {
+                            vvdw             = (vvdw_rep + c12*sh_repulsion)/12.0 - (vvdw_disp + c6*sh_dispersion + c6grid*sh_lj_ewald)/6.0;
+                        }
+                        else
+                        {
+                            vvdw             = vvdw_rep/12.0-vvdw_disp/6.0;
+                        }
+                        break;
+
                     default:
                         gmx_fatal(FARGS, "Death & horror! No generic VdW interaction for ivdw=%d.\n", ivdw);
                         break;
index 2decb74091be888f4ffd15727337ce931dc3584e..7fafb5fc85fc48208041c480836efb6148a89cba 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..ed221ff
--- /dev/null
@@ -0,0 +1,751 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 78 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 78 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*78);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*63);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..081b747
--- /dev/null
@@ -0,0 +1,1267 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*179);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*150);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..7250c93
--- /dev/null
@@ -0,0 +1,2599 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 470 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 470 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*470);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 399 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 399 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*399);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..b95e320
--- /dev/null
@@ -0,0 +1,1425 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 208 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 208 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*208);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*179);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..112077e
--- /dev/null
@@ -0,0 +1,2769 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            }
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 502 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            }
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 502 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*502);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            }
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 431 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            }
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 431 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*431);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..6cece91
--- /dev/null
@@ -0,0 +1,693 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 68 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 68 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*68);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 60 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..f160968
--- /dev/null
@@ -0,0 +1,1133 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*159);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*141);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..c39382a
--- /dev/null
@@ -0,0 +1,2237 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 420 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 420 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*420);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 372 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx01,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
+            
+            fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx02,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
+            
+            fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 372 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*372);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..6f482e0
--- /dev/null
@@ -0,0 +1,1255 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 185 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 185 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*185);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 167 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx10,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
+            
+            fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx20,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
+            
+            fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx30,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz30,fscal,fiz3);
+            
+            fjx0             = _mm_macc_pd(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz30,fscal,fjz0);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 167 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*167);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..27fe4f1
--- /dev/null
@@ -0,0 +1,2371 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 449 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabD,ewtabF);
+            velec            = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 449 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*449);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 401 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx11,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
+            
+            fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx12,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
+            
+            fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix1             = _mm_macc_pd(dx13,fscal,fix1);
+            fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
+            
+            fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx21,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
+            
+            fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx22,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
+            
+            fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix2             = _mm_macc_pd(dx23,fscal,fix2);
+            fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
+            
+            fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx31,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
+            
+            fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx32,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
+            
+            fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+#ifdef __XOP__
+            eweps            = _mm_frcz_pd(ewrt);
+#else
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+#endif
+            twoeweps         = _mm_add_pd(eweps,eweps);
+            gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix3             = _mm_macc_pd(dx33,fscal,fix3);
+            fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
+            
+            fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 401 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*401);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..fd4c323
--- /dev/null
@@ -0,0 +1,631 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 58 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 58 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*58);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_128_fma_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_128_fma_double.c
new file mode 100644 (file)
index 0000000..804b0f4
--- /dev/null
@@ -0,0 +1,577 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_double.h"
+#include "kernelutil_x86_avx_128_fma_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 50 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*50);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 47 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Update vectorial force */
+            fix0             = _mm_macc_pd(dx00,fscal,fix0);
+            fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
+            
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,
+                                                   _mm_mul_pd(dx00,fscal),
+                                                   _mm_mul_pd(dy00,fscal),
+                                                   _mm_mul_pd(dz00,fscal));
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*47);
+}
index 4af5af13042ad78b27e4253c4757eaf85b48c91c..9ad88affd3bf990302d190cf23640c389ce37d53 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_double;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_double;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_double;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_double;
 nb_kernel_info_t
     kernellist_avx_128_fma_double[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_double", "avx_128_fma_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 134d134346aecfbaff1e65b3e0c277e4e829f38d..55c229aa90095374da4c52a60bf73fc0d6dc202a 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -151,6 +151,15 @@ void
     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m128d           c6grid_{I}{J};
+    /* #endfor */
+    real             *vdwgridparam;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128d           one_half  = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -189,6 +198,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -245,8 +260,14 @@ void
     qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
+    c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm_set1_pd(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
     c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -528,8 +549,15 @@ void
             /*             #if ROUND == 'Loop' */
             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}B);
+            /*                 #endif */
             /*             #else */
             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset{I}+vdwjidx{J}A);
+            /*                 #endif */
             /*             #endif */
             /*         #endif */
             /*     #endif */
@@ -834,6 +862,45 @@ void
             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq{I}{J});
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
+            /*                 #define INNERFLOPS INNERFLOPS+10 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_macc_pd(-c6grid_{I}{J},_mm_sub_pd(one,poly),c6_{I}{J}),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_{I}{J},_mm_mul_pd(rinvsix,rinvsix));
+           /*                     #define INNERFLOPS INNERFLOPS+5 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _mm_msub_pd(_mm_nmacc_pd(c12_{I}{J},_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                              _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_{I}{J},sh_lj_ewald,_mm_mul_pd(c6_{I}{J},sh_vdw_invrcut6))),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #else */
+            vvdw             = _mm_msub_pd(vvdw12,one_twelfth,_mm_mul_pd(vvdw6,one_sixth));
+           /*                     #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_{I}{J},one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+5 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_{I}{J},_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_{I}{J},one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_{I}{J},rinvsix,_mm_sub_pd(c6_{I}{J},f6A)),rinvsix,f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+10 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 4b171986468b5d0f90bef86bdb73f7d227e2641c..7a3f21b02dd6446d9740278a76a17b1bd5aacca8 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.                             
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..b7c011a
--- /dev/null
@@ -0,0 +1,848 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*64);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 52 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 53 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*53);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..c699fc1
--- /dev/null
@@ -0,0 +1,1302 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
+            velec            = _mm_mul_ps(qq10,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
+            velec            = _mm_mul_ps(qq20,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
+            velec            = _mm_mul_ps(qq10,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
+            velec            = _mm_mul_ps(qq20,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 132 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*132);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 114 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 117 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*117);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..aa85bb3
--- /dev/null
@@ -0,0 +1,2398 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
+            velec            = _mm_mul_ps(qq01,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
+            velec            = _mm_mul_ps(qq02,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
+            velec            = _mm_mul_ps(qq10,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
+            velec            = _mm_mul_ps(qq11,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
+            velec            = _mm_mul_ps(qq12,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
+            velec            = _mm_mul_ps(qq20,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
+            velec            = _mm_mul_ps(qq21,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
+            velec            = _mm_mul_ps(qq22,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 327 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
+            velec            = _mm_mul_ps(qq01,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
+            velec            = _mm_mul_ps(qq02,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
+            velec            = _mm_mul_ps(qq10,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
+            velec            = _mm_mul_ps(qq11,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
+            velec            = _mm_mul_ps(qq12,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
+            velec            = _mm_mul_ps(qq20,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
+            velec            = _mm_mul_ps(qq21,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
+            velec            = _mm_mul_ps(qq22,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 336 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*336);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 300 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 309 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*309);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..15ceec7
--- /dev/null
@@ -0,0 +1,1462 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
+            velec            = _mm_mul_ps(qq10,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
+            velec            = _mm_mul_ps(qq20,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv30,sh_ewald));
+            velec            = _mm_mul_ps(qq30,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 158 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
+            velec            = _mm_mul_ps(qq10,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
+            velec            = _mm_mul_ps(qq20,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv30,sh_ewald));
+            velec            = _mm_mul_ps(qq30,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 162 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*162);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 147 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*147);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..078b9f7
--- /dev/null
@@ -0,0 +1,2574 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
+            velec            = _mm_mul_ps(qq11,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
+            velec            = _mm_mul_ps(qq12,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv13,sh_ewald));
+            velec            = _mm_mul_ps(qq13,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
+            velec            = _mm_mul_ps(qq21,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
+            velec            = _mm_mul_ps(qq22,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv23,sh_ewald));
+            velec            = _mm_mul_ps(qq23,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv31,sh_ewald));
+            velec            = _mm_mul_ps(qq31,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv32,sh_ewald));
+            velec            = _mm_mul_ps(qq32,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv33,sh_ewald));
+            velec            = _mm_mul_ps(qq33,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
+            velec            = _mm_mul_ps(qq11,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
+            velec            = _mm_mul_ps(qq12,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv13,sh_ewald));
+            velec            = _mm_mul_ps(qq13,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
+            velec            = _mm_mul_ps(qq21,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
+            velec            = _mm_mul_ps(qq22,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv23,sh_ewald));
+            velec            = _mm_mul_ps(qq23,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv31,sh_ewald));
+            velec            = _mm_mul_ps(qq31,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv32,sh_ewald));
+            velec            = _mm_mul_ps(qq32,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv33,sh_ewald));
+            velec            = _mm_mul_ps(qq33,velec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*369);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 332 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 342 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*342);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..cd5354a
--- /dev/null
@@ -0,0 +1,790 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 53 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 54 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..8d217f6
--- /dev/null
@@ -0,0 +1,1168 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
+            velec            = _mm_mul_ps(qq10,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
+            velec            = _mm_mul_ps(qq20,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 111 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
+            velec            = _mm_mul_ps(qq10,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
+            velec            = _mm_mul_ps(qq20,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 114 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*114);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 105 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*108);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..51438bf
--- /dev/null
@@ -0,0 +1,2036 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
+            velec            = _mm_mul_ps(qq01,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
+            velec            = _mm_mul_ps(qq02,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
+            velec            = _mm_mul_ps(qq10,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
+            velec            = _mm_mul_ps(qq11,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
+            velec            = _mm_mul_ps(qq12,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
+            velec            = _mm_mul_ps(qq20,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
+            velec            = _mm_mul_ps(qq21,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
+            velec            = _mm_mul_ps(qq22,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 285 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
+            velec            = _mm_mul_ps(qq00,velec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
+            velec            = _mm_mul_ps(qq01,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
+            velec            = _mm_mul_ps(qq02,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
+            velec            = _mm_mul_ps(qq10,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
+            velec            = _mm_mul_ps(qq11,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
+            velec            = _mm_mul_ps(qq12,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
+            velec            = _mm_mul_ps(qq20,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
+            velec            = _mm_mul_ps(qq21,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
+            velec            = _mm_mul_ps(qq22,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 294 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*294);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 273 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq00);
+            rinv3            = _mm_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq00,felec);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq01);
+            rinv3            = _mm_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq01,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx01,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
+
+            fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq02);
+            rinv3            = _mm_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq02,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx02,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
+
+            fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 282 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*282);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..32025d7
--- /dev/null
@@ -0,0 +1,1292 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
+            velec            = _mm_mul_ps(qq10,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
+            velec            = _mm_mul_ps(qq20,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv30);
+            velec            = _mm_mul_ps(qq30,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
+            velec            = _mm_mul_ps(qq10,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
+            velec            = _mm_mul_ps(qq20,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv30);
+            velec            = _mm_mul_ps(qq30,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq10);
+            rinv3            = _mm_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq10,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx10,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
+
+            fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq20);
+            rinv3            = _mm_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq20,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx20,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
+
+            fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq30);
+            rinv3            = _mm_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq30,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx30,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy30,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz30,fscal,fiz3);
+
+            fjx0             = _mm_macc_ps(dx30,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy30,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz30,fscal,fjz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*135);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..49c8dd6
--- /dev/null
@@ -0,0 +1,2176 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
+            velec            = _mm_mul_ps(qq11,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
+            velec            = _mm_mul_ps(qq12,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
+            velec            = _mm_mul_ps(qq13,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
+            velec            = _mm_mul_ps(qq21,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
+            velec            = _mm_mul_ps(qq22,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
+            velec            = _mm_mul_ps(qq23,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
+            velec            = _mm_mul_ps(qq31,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
+            velec            = _mm_mul_ps(qq32,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
+            velec            = _mm_mul_ps(qq33,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 314 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
+            velec            = _mm_mul_ps(qq11,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
+            velec            = _mm_mul_ps(qq12,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
+            velec            = _mm_mul_ps(qq13,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
+            velec            = _mm_mul_ps(qq21,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
+            velec            = _mm_mul_ps(qq22,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
+            velec            = _mm_mul_ps(qq23,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
+            velec            = _mm_mul_ps(qq31,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
+            velec            = _mm_mul_ps(qq32,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
+            velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
+            velec            = _mm_mul_ps(qq33,velec);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*324);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm_mul_ps(beta,beta);
+    beta3            = _mm_mul_ps(beta,beta2);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+
+            fscal            = felec;
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 302 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
+            fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
+            fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq11);
+            rinv3            = _mm_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq11,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx11,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
+
+            fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq12);
+            rinv3            = _mm_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq12,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx12,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
+
+            fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq13);
+            rinv3            = _mm_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq13,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix1             = _mm_macc_ps(dx13,fscal,fix1);
+            fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
+            fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
+
+            fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq21);
+            rinv3            = _mm_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq21,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx21,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
+
+            fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq22);
+            rinv3            = _mm_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq22,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx22,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
+
+            fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq23);
+            rinv3            = _mm_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq23,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix2             = _mm_macc_ps(dx23,fscal,fix2);
+            fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
+            fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
+
+            fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq31);
+            rinv3            = _mm_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq31,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx31,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
+
+            fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
+            fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
+            fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq32);
+            rinv3            = _mm_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq32,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx32,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
+
+            fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
+            fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
+            fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Analytical PME correction */
+            zeta2            = _mm_mul_ps(beta2,rsq33);
+            rinv3            = _mm_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
+            felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
+            felec            = _mm_mul_ps(qq33,felec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix3             = _mm_macc_ps(dx33,fscal,fix3);
+            fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
+            fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
+
+            fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
+            fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
+            fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 312 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*312);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..013daec
--- /dev/null
@@ -0,0 +1,751 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_00,sh_lj_ewald,_mm_mul_ps(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*60);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            }
+
+            /* Inner loop uses 51 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*51);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_128_fma_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_128_fma_single.c
new file mode 100644 (file)
index 0000000..f2f2674
--- /dev/null
@@ -0,0 +1,697 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_128_fma_single.h"
+#include "kernelutil_x86_avx_128_fma_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 50 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 51 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*51);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 47 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+             /* Update vectorial force */
+            fix0             = _mm_macc_ps(dx00,fscal,fix0);
+            fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
+            fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   _mm_mul_ps(dx00,fscal),
+                                                   _mm_mul_ps(dy00,fscal),
+                                                   _mm_mul_ps(dz00,fscal));
+
+            /* Inner loop uses 48 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*48);
+}
index 22b1ca9e314ca037c79e9b8a5ede057963080b75..9f196253652188d9de28bebc558f2c4e048c20bf 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_single;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_128_fma_single;
 nb_kernel_info_t
     kernellist_avx_128_fma_single[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single", "avx_128_fma_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 792060656063be6488194ab573e561bcd7458fe3..e2849dc7545204c7cea122dbe189050760c660c8 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -154,6 +154,15 @@ void
     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m128           c6grid_{I}{J};
+    /* #endfor */
+    real             *vdwgridparam;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -193,6 +202,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -252,8 +267,14 @@ void
     qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
+    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm_set1_ps(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
     c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -544,6 +565,14 @@ void
                                          vdwparam+vdwioffset{I}+vdwjidx{J}C,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}D,
                                          &c6_{I}{J},&c12_{I}{J});
+
+            /*           #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}B,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}C,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}D);
+            /*           #endif */
+
             /*         #endif */
             /*     #endif */
 
@@ -784,6 +813,45 @@ void
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq{I}{J});
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
+            /*                 #define INNERFLOPS INNERFLOPS+10 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+           vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_{I}{J},_mm_sub_ps(one,poly),c6_{I}{J}),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
+           /*                     #define INNERFLOPS INNERFLOPS+5 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                          _mm_mul_ps(_mm_sub_ps(vvdw6,_mm_macc_ps(c6grid_{I}{J},sh_lj_ewald,_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6))),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+7 */
+            /*                 #else */
+            vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
+           /*                     #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_{I}{J},one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+5 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_{I}{J},_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_{I}{J},one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_{I}{J},rinvsix,_mm_sub_ps(c6_{I}{J},f6A)),rinvsix,f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+10 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 860b751044e9c679d4e5c53ad5167871983e80da..0b0bce00dcd134b5dfac74344d2882730bd936f4 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for AVX_256 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..61c224c
--- /dev/null
@@ -0,0 +1,892 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 82 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 83 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*83);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*63);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..6aeb9dd
--- /dev/null
@@ -0,0 +1,1434 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*180);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 146 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*146);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_256_double.c
new file mode 100644 (file)
index 0000000..828fec1
--- /dev/null
@@ -0,0 +1,2746 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_01;
+    __m256d           c6grid_02;
+    __m256d           c6grid_10;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_20;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_pd(charge[inr+0]);
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_pd(iq0,jq0);
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_pd(iq0,jq1);
+    qq02             = _mm256_mul_pd(iq0,jq2);
+    qq10             = _mm256_mul_pd(iq1,jq0);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq20             = _mm256_mul_pd(iq2,jq0);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 450 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+            r01              = _mm256_andnot_pd(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+            r02              = _mm256_andnot_pd(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_01;
+    __m256d           c6grid_02;
+    __m256d           c6grid_10;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_20;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_pd(charge[inr+0]);
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_pd(iq0,jq0);
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_pd(iq0,jq1);
+    qq02             = _mm256_mul_pd(iq0,jq2);
+    qq10             = _mm256_mul_pd(iq1,jq0);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq20             = _mm256_mul_pd(iq2,jq0);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+            r01              = _mm256_andnot_pd(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+            r02              = _mm256_andnot_pd(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*383);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..2e555c7
--- /dev/null
@@ -0,0 +1,1616 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    __m256d           c6grid_30;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 203 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+            r30              = _mm256_andnot_pd(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*207);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    __m256d           c6grid_30;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+            r30              = _mm256_andnot_pd(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 173 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*173);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_double.c
new file mode 100644 (file)
index 0000000..b586343
--- /dev/null
@@ -0,0 +1,2944 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_13;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    __m256d           c6grid_23;
+    __m256d           c6grid_31;
+    __m256d           c6grid_32;
+    __m256d           c6grid_33;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    jq3              = _mm256_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq13             = _mm256_mul_pd(iq1,jq3);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+    qq23             = _mm256_mul_pd(iq2,jq3);
+    qq31             = _mm256_mul_pd(iq3,jq1);
+    qq32             = _mm256_mul_pd(iq3,jq2);
+    qq33             = _mm256_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 479 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+            r13              = _mm256_andnot_pd(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+            r23              = _mm256_andnot_pd(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+            r31              = _mm256_andnot_pd(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+            r32              = _mm256_andnot_pd(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+            r33              = _mm256_andnot_pd(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_pd(velec,cutoff_mask);
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 489 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*489);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_13;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    __m256d           c6grid_23;
+    __m256d           c6grid_31;
+    __m256d           c6grid_32;
+    __m256d           c6grid_33;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    jq3              = _mm256_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq13             = _mm256_mul_pd(iq1,jq3);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+    qq23             = _mm256_mul_pd(iq2,jq3);
+    qq31             = _mm256_mul_pd(iq3,jq1);
+    qq32             = _mm256_mul_pd(iq3,jq2);
+    qq33             = _mm256_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+            r13              = _mm256_andnot_pd(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+            r23              = _mm256_andnot_pd(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+            r31              = _mm256_andnot_pd(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+            r32              = _mm256_andnot_pd(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+            r33              = _mm256_andnot_pd(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*413);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..75c3421
--- /dev/null
@@ -0,0 +1,834 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 72 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 73 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..651be4c
--- /dev/null
@@ -0,0 +1,1300 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 160 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*160);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_pd(iq0,jq0);
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_double.c
new file mode 100644 (file)
index 0000000..7760286
--- /dev/null
@@ -0,0 +1,2384 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_01;
+    __m256d           c6grid_02;
+    __m256d           c6grid_10;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_20;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_pd(charge[inr+0]);
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_pd(iq0,jq0);
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_pd(iq0,jq1);
+    qq02             = _mm256_mul_pd(iq0,jq2);
+    qq10             = _mm256_mul_pd(iq1,jq0);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq20             = _mm256_mul_pd(iq2,jq0);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+            r01              = _mm256_andnot_pd(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+            r02              = _mm256_andnot_pd(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 409 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_01;
+    __m256d           c6grid_02;
+    __m256d           c6grid_10;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_20;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_pd(charge[inr+0]);
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_pd(iq0,jq0);
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_pd(iq0,jq1);
+    qq02             = _mm256_mul_pd(iq0,jq2);
+    qq10             = _mm256_mul_pd(iq1,jq0);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq20             = _mm256_mul_pd(iq2,jq0);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx01             = _mm256_sub_pd(ix0,jx1);
+            dy01             = _mm256_sub_pd(iy0,jy1);
+            dz01             = _mm256_sub_pd(iz0,jz1);
+            dx02             = _mm256_sub_pd(ix0,jx2);
+            dy02             = _mm256_sub_pd(iy0,jy2);
+            dz02             = _mm256_sub_pd(iz0,jz2);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm256_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm256_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r00,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_pd(felec,fvdw);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_pd(rsq01,rinv01);
+            r01              = _mm256_andnot_pd(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r01,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx01);
+            ty               = _mm256_mul_pd(fscal,dy01);
+            tz               = _mm256_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_pd(rsq02,rinv02);
+            r02              = _mm256_andnot_pd(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r02,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx02);
+            ty               = _mm256_mul_pd(fscal,dy02);
+            tz               = _mm256_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 356 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..294e597
--- /dev/null
@@ -0,0 +1,1446 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    __m256d           c6grid_30;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+            r30              = _mm256_andnot_pd(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 184 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*184);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_10;
+    __m256d           c6grid_20;
+    __m256d           c6grid_30;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx10             = _mm256_sub_pd(ix1,jx0);
+            dy10             = _mm256_sub_pd(iy1,jy0);
+            dz10             = _mm256_sub_pd(iz1,jz0);
+            dx20             = _mm256_sub_pd(ix2,jx0);
+            dy20             = _mm256_sub_pd(iy2,jy0);
+            dz20             = _mm256_sub_pd(iz2,jz0);
+            dx30             = _mm256_sub_pd(ix3,jx0);
+            dy30             = _mm256_sub_pd(iy3,jy0);
+            dz30             = _mm256_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm256_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm256_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm256_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_pd(rsq10,rinv10);
+            r10              = _mm256_andnot_pd(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r10,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx10);
+            ty               = _mm256_mul_pd(fscal,dy10);
+            tz               = _mm256_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_pd(rsq20,rinv20);
+            r20              = _mm256_andnot_pd(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r20,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx20);
+            ty               = _mm256_mul_pd(fscal,dy20);
+            tz               = _mm256_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_pd(rsq30,rinv30);
+            r30              = _mm256_andnot_pd(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r30,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx30);
+            ty               = _mm256_mul_pd(fscal,dy30);
+            tz               = _mm256_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 161 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*161);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_double.c
new file mode 100644 (file)
index 0000000..3fe41bb
--- /dev/null
@@ -0,0 +1,2546 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_13;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    __m256d           c6grid_23;
+    __m256d           c6grid_31;
+    __m256d           c6grid_32;
+    __m256d           c6grid_33;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    jq3              = _mm256_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq13             = _mm256_mul_pd(iq1,jq3);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+    qq23             = _mm256_mul_pd(iq2,jq3);
+    qq31             = _mm256_mul_pd(iq3,jq1);
+    qq32             = _mm256_mul_pd(iq3,jq2);
+    qq33             = _mm256_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_pd();
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 426 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+            r13              = _mm256_andnot_pd(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(rinv13,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+            r23              = _mm256_andnot_pd(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(rinv23,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+            r31              = _mm256_andnot_pd(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(rinv31,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+            r32              = _mm256_andnot_pd(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(rinv32,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+            r33              = _mm256_andnot_pd(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
+            GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
+            velec            = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
+            velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(rinv33,velec));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_pd(dummy_mask,velec);
+            velecsum         = _mm256_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 436 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*436);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    __m256d           c6grid_11;
+    __m256d           c6grid_12;
+    __m256d           c6grid_13;
+    __m256d           c6grid_21;
+    __m256d           c6grid_22;
+    __m256d           c6grid_23;
+    __m256d           c6grid_31;
+    __m256d           c6grid_32;
+    __m256d           c6grid_33;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256d          beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_pd(fr->ic->sh_ewald);
+    beta             = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_pd(beta,beta);
+    beta3            = _mm256_mul_pd(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
+    iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
+    iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_pd(charge[inr+1]);
+    jq2              = _mm256_set1_pd(charge[inr+2]);
+    jq3              = _mm256_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_pd(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_pd(iq1,jq1);
+    qq12             = _mm256_mul_pd(iq1,jq2);
+    qq13             = _mm256_mul_pd(iq1,jq3);
+    qq21             = _mm256_mul_pd(iq2,jq1);
+    qq22             = _mm256_mul_pd(iq2,jq2);
+    qq23             = _mm256_mul_pd(iq2,jq3);
+    qq31             = _mm256_mul_pd(iq3,jq1);
+    qq32             = _mm256_mul_pd(iq3,jq2);
+    qq33             = _mm256_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+        fix1             = _mm256_setzero_pd();
+        fiy1             = _mm256_setzero_pd();
+        fiz1             = _mm256_setzero_pd();
+        fix2             = _mm256_setzero_pd();
+        fiy2             = _mm256_setzero_pd();
+        fiz2             = _mm256_setzero_pd();
+        fix3             = _mm256_setzero_pd();
+        fiy3             = _mm256_setzero_pd();
+        fiz3             = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+            dx11             = _mm256_sub_pd(ix1,jx1);
+            dy11             = _mm256_sub_pd(iy1,jy1);
+            dz11             = _mm256_sub_pd(iz1,jz1);
+            dx12             = _mm256_sub_pd(ix1,jx2);
+            dy12             = _mm256_sub_pd(iy1,jy2);
+            dz12             = _mm256_sub_pd(iz1,jz2);
+            dx13             = _mm256_sub_pd(ix1,jx3);
+            dy13             = _mm256_sub_pd(iy1,jy3);
+            dz13             = _mm256_sub_pd(iz1,jz3);
+            dx21             = _mm256_sub_pd(ix2,jx1);
+            dy21             = _mm256_sub_pd(iy2,jy1);
+            dz21             = _mm256_sub_pd(iz2,jz1);
+            dx22             = _mm256_sub_pd(ix2,jx2);
+            dy22             = _mm256_sub_pd(iy2,jy2);
+            dz22             = _mm256_sub_pd(iz2,jz2);
+            dx23             = _mm256_sub_pd(ix2,jx3);
+            dy23             = _mm256_sub_pd(iy2,jy3);
+            dz23             = _mm256_sub_pd(iz2,jz3);
+            dx31             = _mm256_sub_pd(ix3,jx1);
+            dy31             = _mm256_sub_pd(iy3,jy1);
+            dz31             = _mm256_sub_pd(iz3,jz1);
+            dx32             = _mm256_sub_pd(ix3,jx2);
+            dy32             = _mm256_sub_pd(iy3,jy2);
+            dz32             = _mm256_sub_pd(iz3,jz2);
+            dx33             = _mm256_sub_pd(ix3,jx3);
+            dy33             = _mm256_sub_pd(iy3,jy3);
+            dz33             = _mm256_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm256_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm256_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm256_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm256_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm256_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm256_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm256_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm256_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm256_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_pd();
+            fjy0             = _mm256_setzero_pd();
+            fjz0             = _mm256_setzero_pd();
+            fjx1             = _mm256_setzero_pd();
+            fjy1             = _mm256_setzero_pd();
+            fjz1             = _mm256_setzero_pd();
+            fjx2             = _mm256_setzero_pd();
+            fjy2             = _mm256_setzero_pd();
+            fjz2             = _mm256_setzero_pd();
+            fjx3             = _mm256_setzero_pd();
+            fjy3             = _mm256_setzero_pd();
+            fjz3             = _mm256_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjx0             = _mm256_add_pd(fjx0,tx);
+            fjy0             = _mm256_add_pd(fjy0,ty);
+            fjz0             = _mm256_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_pd(rsq11,rinv11);
+            r11              = _mm256_andnot_pd(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r11,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx11);
+            ty               = _mm256_mul_pd(fscal,dy11);
+            tz               = _mm256_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_pd(rsq12,rinv12);
+            r12              = _mm256_andnot_pd(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r12,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx12);
+            ty               = _mm256_mul_pd(fscal,dy12);
+            tz               = _mm256_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_pd(rsq13,rinv13);
+            r13              = _mm256_andnot_pd(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r13,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq13,rinv13),_mm256_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx13);
+            ty               = _mm256_mul_pd(fscal,dy13);
+            tz               = _mm256_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_pd(fix1,tx);
+            fiy1             = _mm256_add_pd(fiy1,ty);
+            fiz1             = _mm256_add_pd(fiz1,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_pd(rsq21,rinv21);
+            r21              = _mm256_andnot_pd(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r21,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx21);
+            ty               = _mm256_mul_pd(fscal,dy21);
+            tz               = _mm256_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_pd(rsq22,rinv22);
+            r22              = _mm256_andnot_pd(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r22,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx22);
+            ty               = _mm256_mul_pd(fscal,dy22);
+            tz               = _mm256_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_pd(rsq23,rinv23);
+            r23              = _mm256_andnot_pd(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r23,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq23,rinv23),_mm256_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx23);
+            ty               = _mm256_mul_pd(fscal,dy23);
+            tz               = _mm256_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_pd(fix2,tx);
+            fiy2             = _mm256_add_pd(fiy2,ty);
+            fiz2             = _mm256_add_pd(fiz2,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_pd(rsq31,rinv31);
+            r31              = _mm256_andnot_pd(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r31,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq31,rinv31),_mm256_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx31);
+            ty               = _mm256_mul_pd(fscal,dy31);
+            tz               = _mm256_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx1             = _mm256_add_pd(fjx1,tx);
+            fjy1             = _mm256_add_pd(fjy1,ty);
+            fjz1             = _mm256_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_pd(rsq32,rinv32);
+            r32              = _mm256_andnot_pd(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r32,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq32,rinv32),_mm256_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx32);
+            ty               = _mm256_mul_pd(fscal,dy32);
+            tz               = _mm256_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx2             = _mm256_add_pd(fjx2,tx);
+            fjy2             = _mm256_add_pd(fjy2,ty);
+            fjz2             = _mm256_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_pd(rsq33,rinv33);
+            r33              = _mm256_andnot_pd(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm256_mul_pd(r33,ewtabscale);
+            ewitab           = _mm256_cvttpd_epi32(ewrt);
+            eweps            = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
+                                            ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
+                                            &ewtabF,&ewtabFn);
+            felec            = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
+            felec            = _mm256_mul_pd(_mm256_mul_pd(qq33,rinv33),_mm256_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx33);
+            ty               = _mm256_mul_pd(fscal,dy33);
+            tz               = _mm256_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_pd(fix3,tx);
+            fiy3             = _mm256_add_pd(fiy3,ty);
+            fiz3             = _mm256_add_pd(fiz3,tz);
+
+            fjx3             = _mm256_add_pd(fjx3,tx);
+            fjy3             = _mm256_add_pd(fjy3,ty);
+            fjz3             = _mm256_add_pd(fjz3,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..a07ba3d
--- /dev/null
@@ -0,0 +1,777 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_00,sh_vdw_invrcut6),_mm256_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*63);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm256_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_double.c
new file mode 100644 (file)
index 0000000..9372123
--- /dev/null
@@ -0,0 +1,723 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_double.h"
+#include "kernelutil_x86_avx_256_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm256_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 54 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_00,_mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 55 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*55);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
+    __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
+    __m256d           c6grid_00;
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    __m256d          dummy_mask,cutoff_mask;
+    __m128           tmpmask0,tmpmask1;
+    __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
+    __m256d          one     = _mm256_set1_pd(1.0);
+    __m256d          two     = _mm256_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_pd();
+        fiy0             = _mm256_setzero_pd();
+        fiz0             = _mm256_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
+             */
+            tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+
+            tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
+            tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
+            dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
+
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_pd(ix0,jx0);
+            dy00             = _mm256_sub_pd(iy0,jy0);
+            dz00             = _mm256_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_pd(rsq00,rinv00);
+            r00              = _mm256_andnot_pd(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_00,_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_00,one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),_mm256_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_pd(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_pd(fscal,dx00);
+            ty               = _mm256_mul_pd(fscal,dy00);
+            tz               = _mm256_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_pd(fix0,tx);
+            fiy0             = _mm256_add_pd(fiy0,ty);
+            fiz0             = _mm256_add_pd(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*47);
+}
index b7cec0c504a0552da16258d3283486d970c89fe8..94d6e3d1c98d86c25e5d3d147a65996c7489b9ad 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double;
 nb_kernel_info_t
     kernellist_avx_256_double[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_double", "avx_256_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_double", "avx_256_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double", "avx_256_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 2e5da27875dc7df920d22f6de999cc741dbbd58c..fc0fb545f55fcd7383bf34b3bdef3ffd3325cbbf 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -122,6 +122,9 @@ void
     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
     /* #for I in PARTICLES_I */
     real *           vdwioffsetptr{I};
+    /* #if 'LJEwald' in KERNEL_VDW */
+    real *           vdwgridioffsetptr{I};
+    /* #endif                      */
     __m256d          ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
     /* #endfor */
     /* #for J in PARTICLES_J */
@@ -155,6 +158,15 @@ void
     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m256d           c6grid_{I}{J};
+    /* #endfor */
+    real             *vdwgridparam;
+    __m256d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256d           one_half  = _mm256_set1_pd(0.5);
+    __m256d           minus_one = _mm256_set1_pd(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m256d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -195,6 +207,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_pd(minus_one,_mm256_mul_pd(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -240,6 +258,9 @@ void
     /*     #endfor */
     /*     #for I in PARTICLES_VDW_I */
     vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
+    /*         #if 'LJEwald' in KERNEL_VDW */
+    vdwgridioffsetptr{I} = vdwgridparam+2*nvdwtype*vdwtype[inr+{I}];
+    /*         #endif */
     /*     #endfor */
     /* #endif */
 
@@ -255,8 +276,14 @@ void
     qq{I}{J}             = _mm256_mul_pd(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
+    c6_{I}{J}            = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A]);
+    c12_{I}{J}           = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm256_set1_pd(vdwgridioffsetptr{I}[vdwjidx{J}A]);
+    /*             #else */
     c6_{I}{J}            = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A]);
     c12_{I}{J}           = _mm256_set1_pd(vdwioffsetptr{I}[vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -365,6 +392,9 @@ void
         /*     #endfor */
         /*     #for I in PARTICLES_VDW_I */
         vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
+        /*         #if 'LJEwald' in KERNEL_VDW */
+        vdwgridioffsetptr{I} = vdwgridparam+2*nvdwtype*vdwtype[inr+{I}];
+        /*         #endif */
         /*     #endfor */
         /* #endif */
 
@@ -552,6 +582,13 @@ void
                                             vdwioffsetptr{I}+vdwjidx{J}C,
                                             vdwioffsetptr{I}+vdwjidx{J}D,
                                             &c6_{I}{J},&c12_{I}{J});
+
+            /*           #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr{I}+vdwjidx{J}A,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}B,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}C,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}D);
+            /*           #endif */
             /*         #endif */
             /*     #endif */
 
@@ -804,6 +841,45 @@ void
             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+           /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm256_mul_pd(ewclj2,rsq{I}{J});
+            ewclj6           = _mm256_mul_pd(ewclj2,_mm256_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_pd(exponent,_mm256_add_pd(_mm256_sub_pd(one,ewcljrsq),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_pd(_mm256_sub_pd(c6_{I}{J},_mm256_mul_pd(c6grid_{I}{J},_mm256_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_pd(c12_{I}{J},_mm256_mul_pd(rinvsix,rinvsix));
+           /*                     #define INNERFLOPS INNERFLOPS+6 */
+           /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_{I}{J},_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_add_pd(_mm256_mul_pd(c6_{I}{J},sh_vdw_invrcut6),_mm256_mul_pd(c6grid_{I}{J},sh_lj_ewald))),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+10 */
+            /*                 #else */
+            vvdw             = _mm256_sub_pd(_mm256_mul_pd(vvdw12,one_twelfth),_mm256_mul_pd(vvdw6,one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+           /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,_mm256_sub_pd(vvdw6,_mm256_mul_pd(_mm256_mul_pd(c6grid_{I}{J},one_sixth),_mm256_mul_pd(exponent,ewclj6)))),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_pd(c6grid_{I}{J},_mm256_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_pd(_mm256_mul_pd(c6grid_{I}{J},one_sixth),_mm256_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_{I}{J},rinvsix),_mm256_sub_pd(c6_{I}{J},f6A)),rinvsix),f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 1cecad31d7b26ac9520ef10d7210710237bfd9b8..7fbc8e565ac6bf737e77e4bf4752833b0d0fd5e8 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for AVX_256 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..5eca689
--- /dev/null
@@ -0,0 +1,1004 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 145 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 146 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*146);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 82 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 83 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*83);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..cd8a4e4
--- /dev/null
@@ -0,0 +1,1518 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 366 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*369);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 203 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 206 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*206);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_avx_256_single.c
new file mode 100644 (file)
index 0000000..6b8296b
--- /dev/null
@@ -0,0 +1,2690 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_01;
+    __m256           c6grid_02;
+    __m256           c6grid_10;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_20;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_ps(charge[inr+0]);
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_ps(iq0,jq0);
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_ps(iq0,jq1);
+    qq02             = _mm256_mul_ps(iq0,jq2);
+    qq10             = _mm256_mul_ps(iq1,jq0);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq20             = _mm256_mul_ps(iq2,jq0);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq01,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq02,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 1017 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+            r01              = _mm256_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq01,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+            r02              = _mm256_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq02,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 1026 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*1026);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_01;
+    __m256           c6grid_02;
+    __m256           c6grid_10;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_20;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_ps(charge[inr+0]);
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_ps(iq0,jq0);
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_ps(iq0,jq1);
+    qq02             = _mm256_mul_ps(iq0,jq2);
+    qq10             = _mm256_mul_ps(iq1,jq0);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq20             = _mm256_mul_ps(iq2,jq0);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 554 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+            r01              = _mm256_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+            r02              = _mm256_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 563 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*563);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..ae0dfd6
--- /dev/null
@@ -0,0 +1,1700 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    __m256           c6grid_30;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv30,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq30,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 392 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+            r30              = _mm256_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv30,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq30,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    __m256           c6grid_30;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 229 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+            r30              = _mm256_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 233 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*233);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_avx_256_single.c
new file mode 100644 (file)
index 0000000..5da9a1d
--- /dev/null
@@ -0,0 +1,2888 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
+    __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_13;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    __m256           c6grid_23;
+    __m256           c6grid_31;
+    __m256           c6grid_32;
+    __m256           c6grid_33;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    jq3              = _mm256_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq13             = _mm256_mul_ps(iq1,jq3);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+    qq23             = _mm256_mul_ps(iq2,jq3);
+    qq31             = _mm256_mul_ps(iq3,jq1);
+    qq32             = _mm256_mul_ps(iq3,jq2);
+    qq33             = _mm256_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv13,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq13,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv23,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq23,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv31,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq31,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv32,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq32,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv33,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq33,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 1046 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+            r13              = _mm256_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv13,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq13,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+            r23              = _mm256_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv23,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq23,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+            r31              = _mm256_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv31,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq31,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+            r32              = _mm256_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv32,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq32,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+            r33              = _mm256_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(_mm256_sub_ps(rinv33,sh_ewald),pmecorrV);
+            velec            = _mm256_mul_ps(qq33,velec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_and_ps(velec,cutoff_mask);
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 1056 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*1056);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
+    __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_13;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    __m256           c6grid_23;
+    __m256           c6grid_31;
+    __m256           c6grid_32;
+    __m256           c6grid_33;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    jq3              = _mm256_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq13             = _mm256_mul_ps(iq1,jq3);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+    qq23             = _mm256_mul_ps(iq2,jq3);
+    qq31             = _mm256_mul_ps(iq3,jq1);
+    qq32             = _mm256_mul_ps(iq3,jq2);
+    qq33             = _mm256_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 583 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+            r13              = _mm256_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+            r23              = _mm256_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+            r31              = _mm256_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+            r32              = _mm256_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+            r33              = _mm256_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            
+            cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = felec;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 593 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*593);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..d98f635
--- /dev/null
@@ -0,0 +1,946 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv00,pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 112 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv00,pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 113 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*113);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 79 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 80 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*80);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..c75eec8
--- /dev/null
@@ -0,0 +1,1384 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv00,pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv10,pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv20,pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 283 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv00,pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv10,pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv20,pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 286 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*286);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm256_mul_ps(iq0,jq0);
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 197 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*197);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_256_single.c
new file mode 100644 (file)
index 0000000..89b9ba6
--- /dev/null
@@ -0,0 +1,2328 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_01;
+    __m256           c6grid_02;
+    __m256           c6grid_10;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_20;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_ps(charge[inr+0]);
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_ps(iq0,jq0);
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_ps(iq0,jq1);
+    qq02             = _mm256_mul_ps(iq0,jq2);
+    qq10             = _mm256_mul_ps(iq1,jq0);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq20             = _mm256_mul_ps(iq2,jq0);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv00,pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv01,pmecorrV);
+            velec            = _mm256_mul_ps(qq01,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv02,pmecorrV);
+            velec            = _mm256_mul_ps(qq02,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv10,pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv11,pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv12,pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv20,pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv21,pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv22,pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 784 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv00,pmecorrV);
+            velec            = _mm256_mul_ps(qq00,velec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+            r01              = _mm256_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv01,pmecorrV);
+            velec            = _mm256_mul_ps(qq01,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+            r02              = _mm256_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv02,pmecorrV);
+            velec            = _mm256_mul_ps(qq02,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv10,pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv11,pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv12,pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv20,pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv21,pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv22,pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 793 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*793);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_01;
+    __m256           c6grid_02;
+    __m256           c6grid_10;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_20;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm256_set1_ps(charge[inr+0]);
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm256_mul_ps(iq0,jq0);
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq01             = _mm256_mul_ps(iq0,jq1);
+    qq02             = _mm256_mul_ps(iq0,jq2);
+    qq10             = _mm256_mul_ps(iq1,jq0);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq20             = _mm256_mul_ps(iq2,jq0);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 527 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx01             = _mm256_sub_ps(ix0,jx1);
+            dy01             = _mm256_sub_ps(iy0,jy1);
+            dz01             = _mm256_sub_ps(iz0,jz1);
+            dx02             = _mm256_sub_ps(ix0,jx2);
+            dy02             = _mm256_sub_ps(iy0,jy2);
+            dz02             = _mm256_sub_ps(iz0,jz2);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq00);
+            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq00,felec);
+            
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm256_add_ps(felec,fvdw);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm256_mul_ps(rsq01,rinv01);
+            r01              = _mm256_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq01);
+            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq01,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx01);
+            ty               = _mm256_mul_ps(fscal,dy01);
+            tz               = _mm256_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm256_mul_ps(rsq02,rinv02);
+            r02              = _mm256_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq02);
+            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq02,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx02);
+            ty               = _mm256_mul_ps(fscal,dy02);
+            tz               = _mm256_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 536 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*536);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..0e362a5
--- /dev/null
@@ -0,0 +1,1530 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    __m256           c6grid_30;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv10,pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv20,pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv30,pmecorrV);
+            velec            = _mm256_mul_ps(qq30,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 306 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv10,pmecorrV);
+            velec            = _mm256_mul_ps(qq10,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv20,pmecorrV);
+            velec            = _mm256_mul_ps(qq20,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+            r30              = _mm256_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv30,pmecorrV);
+            velec            = _mm256_mul_ps(qq30,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 310 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*310);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_10;
+    __m256           c6grid_20;
+    __m256           c6grid_30;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 217 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx10             = _mm256_sub_ps(ix1,jx0);
+            dy10             = _mm256_sub_ps(iy1,jy0);
+            dz10             = _mm256_sub_ps(iz1,jz0);
+            dx20             = _mm256_sub_ps(ix2,jx0);
+            dy20             = _mm256_sub_ps(iy2,jy0);
+            dz20             = _mm256_sub_ps(iz2,jz0);
+            dx30             = _mm256_sub_ps(ix3,jx0);
+            dy30             = _mm256_sub_ps(iy3,jy0);
+            dz30             = _mm256_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm256_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                                 charge+jnrC+0,charge+jnrD+0,
+                                                                 charge+jnrE+0,charge+jnrF+0,
+                                                                 charge+jnrG+0,charge+jnrH+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm256_mul_ps(rsq10,rinv10);
+            r10              = _mm256_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm256_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq10);
+            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq10,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx10);
+            ty               = _mm256_mul_ps(fscal,dy10);
+            tz               = _mm256_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm256_mul_ps(rsq20,rinv20);
+            r20              = _mm256_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm256_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq20);
+            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq20,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx20);
+            ty               = _mm256_mul_ps(fscal,dy20);
+            tz               = _mm256_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm256_mul_ps(rsq30,rinv30);
+            r30              = _mm256_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm256_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq30);
+            rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq30,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx30);
+            ty               = _mm256_mul_ps(fscal,dy30);
+            tz               = _mm256_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 221 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*221);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_256_single.c
new file mode 100644 (file)
index 0000000..6eea3e8
--- /dev/null
@@ -0,0 +1,2490 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
+    __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_13;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    __m256           c6grid_23;
+    __m256           c6grid_31;
+    __m256           c6grid_32;
+    __m256           c6grid_33;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    jq3              = _mm256_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq13             = _mm256_mul_ps(iq1,jq3);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+    qq23             = _mm256_mul_ps(iq2,jq3);
+    qq31             = _mm256_mul_ps(iq3,jq1);
+    qq32             = _mm256_mul_ps(iq3,jq2);
+    qq33             = _mm256_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm256_setzero_ps();
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv11,pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv12,pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv13,pmecorrV);
+            velec            = _mm256_mul_ps(qq13,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv21,pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv22,pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv23,pmecorrV);
+            velec            = _mm256_mul_ps(qq23,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv31,pmecorrV);
+            velec            = _mm256_mul_ps(qq31,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv32,pmecorrV);
+            velec            = _mm256_mul_ps(qq32,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv33,pmecorrV);
+            velec            = _mm256_mul_ps(qq33,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 810 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv11,pmecorrV);
+            velec            = _mm256_mul_ps(qq11,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv12,pmecorrV);
+            velec            = _mm256_mul_ps(qq12,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+            r13              = _mm256_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv13,pmecorrV);
+            velec            = _mm256_mul_ps(qq13,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv21,pmecorrV);
+            velec            = _mm256_mul_ps(qq21,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv22,pmecorrV);
+            velec            = _mm256_mul_ps(qq22,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+            r23              = _mm256_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv23,pmecorrV);
+            velec            = _mm256_mul_ps(qq23,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+            r31              = _mm256_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv31,pmecorrV);
+            velec            = _mm256_mul_ps(qq31,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+            r32              = _mm256_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv32,pmecorrV);
+            velec            = _mm256_mul_ps(qq32,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+            r33              = _mm256_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
+            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
+            velec            = _mm256_sub_ps(rinv33,pmecorrV);
+            velec            = _mm256_mul_ps(qq33,velec);
+            
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm256_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm256_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 820 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*820);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    real *           vdwioffsetptr1;
+    real *           vdwgridioffsetptr1;
+    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    real *           vdwioffsetptr2;
+    real *           vdwgridioffsetptr2;
+    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    real *           vdwioffsetptr3;
+    real *           vdwgridioffsetptr3;
+    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
+    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
+    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
+    __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    __m256           c6grid_11;
+    __m256           c6grid_12;
+    __m256           c6grid_13;
+    __m256           c6grid_21;
+    __m256           c6grid_22;
+    __m256           c6grid_23;
+    __m256           c6grid_31;
+    __m256           c6grid_32;
+    __m256           c6grid_33;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256i          ewitab;
+    __m128i          ewitab_lo,ewitab_hi;
+    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
+    real             *ewtab;
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm256_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
+    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
+    beta2            = _mm256_mul_ps(beta,beta);
+    beta3            = _mm256_mul_ps(beta,beta2);
+
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
+    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
+    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
+    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+    vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm256_set1_ps(charge[inr+1]);
+    jq2              = _mm256_set1_ps(charge[inr+2]);
+    jq3              = _mm256_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
+    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
+    c6grid_00        = _mm256_set1_ps(vdwgridioffsetptr0[vdwjidx0A]);
+    qq11             = _mm256_mul_ps(iq1,jq1);
+    qq12             = _mm256_mul_ps(iq1,jq2);
+    qq13             = _mm256_mul_ps(iq1,jq3);
+    qq21             = _mm256_mul_ps(iq2,jq1);
+    qq22             = _mm256_mul_ps(iq2,jq2);
+    qq23             = _mm256_mul_ps(iq2,jq3);
+    qq31             = _mm256_mul_ps(iq3,jq1);
+    qq32             = _mm256_mul_ps(iq3,jq2);
+    qq33             = _mm256_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+        fix1             = _mm256_setzero_ps();
+        fiy1             = _mm256_setzero_ps();
+        fiz1             = _mm256_setzero_ps();
+        fix2             = _mm256_setzero_ps();
+        fiy2             = _mm256_setzero_ps();
+        fiz2             = _mm256_setzero_ps();
+        fix3             = _mm256_setzero_ps();
+        fiy3             = _mm256_setzero_ps();
+        fiz3             = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 553 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                                 &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+            dx11             = _mm256_sub_ps(ix1,jx1);
+            dy11             = _mm256_sub_ps(iy1,jy1);
+            dz11             = _mm256_sub_ps(iz1,jz1);
+            dx12             = _mm256_sub_ps(ix1,jx2);
+            dy12             = _mm256_sub_ps(iy1,jy2);
+            dz12             = _mm256_sub_ps(iz1,jz2);
+            dx13             = _mm256_sub_ps(ix1,jx3);
+            dy13             = _mm256_sub_ps(iy1,jy3);
+            dz13             = _mm256_sub_ps(iz1,jz3);
+            dx21             = _mm256_sub_ps(ix2,jx1);
+            dy21             = _mm256_sub_ps(iy2,jy1);
+            dz21             = _mm256_sub_ps(iz2,jz1);
+            dx22             = _mm256_sub_ps(ix2,jx2);
+            dy22             = _mm256_sub_ps(iy2,jy2);
+            dz22             = _mm256_sub_ps(iz2,jz2);
+            dx23             = _mm256_sub_ps(ix2,jx3);
+            dy23             = _mm256_sub_ps(iy2,jy3);
+            dz23             = _mm256_sub_ps(iz2,jz3);
+            dx31             = _mm256_sub_ps(ix3,jx1);
+            dy31             = _mm256_sub_ps(iy3,jy1);
+            dz31             = _mm256_sub_ps(iz3,jz1);
+            dx32             = _mm256_sub_ps(ix3,jx2);
+            dy32             = _mm256_sub_ps(iy3,jy2);
+            dz32             = _mm256_sub_ps(iz3,jz2);
+            dx33             = _mm256_sub_ps(ix3,jx3);
+            dy33             = _mm256_sub_ps(iy3,jy3);
+            dz33             = _mm256_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm256_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm256_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm256_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm256_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm256_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm256_setzero_ps();
+            fjy0             = _mm256_setzero_ps();
+            fjz0             = _mm256_setzero_ps();
+            fjx1             = _mm256_setzero_ps();
+            fjy1             = _mm256_setzero_ps();
+            fjz1             = _mm256_setzero_ps();
+            fjx2             = _mm256_setzero_ps();
+            fjy2             = _mm256_setzero_ps();
+            fjz2             = _mm256_setzero_ps();
+            fjx3             = _mm256_setzero_ps();
+            fjy3             = _mm256_setzero_ps();
+            fjz3             = _mm256_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjx0             = _mm256_add_ps(fjx0,tx);
+            fjy0             = _mm256_add_ps(fjy0,ty);
+            fjz0             = _mm256_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm256_mul_ps(rsq11,rinv11);
+            r11              = _mm256_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq11);
+            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq11,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx11);
+            ty               = _mm256_mul_ps(fscal,dy11);
+            tz               = _mm256_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm256_mul_ps(rsq12,rinv12);
+            r12              = _mm256_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq12);
+            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq12,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx12);
+            ty               = _mm256_mul_ps(fscal,dy12);
+            tz               = _mm256_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm256_mul_ps(rsq13,rinv13);
+            r13              = _mm256_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq13);
+            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq13,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx13);
+            ty               = _mm256_mul_ps(fscal,dy13);
+            tz               = _mm256_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm256_add_ps(fix1,tx);
+            fiy1             = _mm256_add_ps(fiy1,ty);
+            fiz1             = _mm256_add_ps(fiz1,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm256_mul_ps(rsq21,rinv21);
+            r21              = _mm256_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq21);
+            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq21,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx21);
+            ty               = _mm256_mul_ps(fscal,dy21);
+            tz               = _mm256_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm256_mul_ps(rsq22,rinv22);
+            r22              = _mm256_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq22);
+            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq22,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx22);
+            ty               = _mm256_mul_ps(fscal,dy22);
+            tz               = _mm256_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm256_mul_ps(rsq23,rinv23);
+            r23              = _mm256_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq23);
+            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq23,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx23);
+            ty               = _mm256_mul_ps(fscal,dy23);
+            tz               = _mm256_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm256_add_ps(fix2,tx);
+            fiy2             = _mm256_add_ps(fiy2,ty);
+            fiz2             = _mm256_add_ps(fiz2,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm256_mul_ps(rsq31,rinv31);
+            r31              = _mm256_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq31);
+            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq31,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx31);
+            ty               = _mm256_mul_ps(fscal,dy31);
+            tz               = _mm256_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx1             = _mm256_add_ps(fjx1,tx);
+            fjy1             = _mm256_add_ps(fjy1,ty);
+            fjz1             = _mm256_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm256_mul_ps(rsq32,rinv32);
+            r32              = _mm256_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq32);
+            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq32,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx32);
+            ty               = _mm256_mul_ps(fscal,dy32);
+            tz               = _mm256_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx2             = _mm256_add_ps(fjx2,tx);
+            fjy2             = _mm256_add_ps(fjy2,ty);
+            fjz2             = _mm256_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm256_mul_ps(rsq33,rinv33);
+            r33              = _mm256_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+            
+            /* Analytical PME correction */
+            zeta2            = _mm256_mul_ps(beta2,rsq33);
+            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
+            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
+            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
+            felec            = _mm256_mul_ps(qq33,felec);
+            
+            fscal            = felec;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx33);
+            ty               = _mm256_mul_ps(fscal,dy33);
+            tz               = _mm256_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm256_add_ps(fix3,tx);
+            fiy3             = _mm256_add_ps(fiy3,ty);
+            fiz3             = _mm256_add_ps(fiz3,tz);
+
+            fjx3             = _mm256_add_ps(fjx3,tx);
+            fjy3             = _mm256_add_ps(fjy3,ty);
+            fjz3             = _mm256_add_ps(fjz3,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+
+            gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
+                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                      fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 563 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*563);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..2f49769
--- /dev/null
@@ -0,0 +1,893 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_00,sh_vdw_invrcut6),_mm256_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*63);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm256_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm256_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_single.c
new file mode 100644 (file)
index 0000000..465393a
--- /dev/null
@@ -0,0 +1,839 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS avx_256_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_avx_256_single.h"
+#include "kernelutil_x86_avx_256_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm256_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 52 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*52);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrE,jnrF,jnrG,jnrH;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
+    real             scratch[4*DIM];
+    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    real *           vdwioffsetptr0;
+    real *           vdwgridioffsetptr0;
+    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
+    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
+    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
+    __m256           c6grid_00;
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    __m256           dummy_mask,cutoff_mask;
+    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
+    __m256           one     = _mm256_set1_ps(1.0);
+    __m256           two     = _mm256_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+    j_coord_offsetE = 0;
+    j_coord_offsetF = 0;
+    j_coord_offsetG = 0;
+    j_coord_offsetH = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm256_setzero_ps();
+        fiy0             = _mm256_setzero_ps();
+        fiz0             = _mm256_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
+        vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            jnrE             = jjnr[jidx+4];
+            jnrF             = jjnr[jidx+5];
+            jnrG             = jjnr[jidx+6];
+            jnrH             = jjnr[jidx+7];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            fjptrE             = f+j_coord_offsetE;
+            fjptrF             = f+j_coord_offsetF;
+            fjptrG             = f+j_coord_offsetG;
+            fjptrH             = f+j_coord_offsetH;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            jnrlistE         = jjnr[jidx+4];
+            jnrlistF         = jjnr[jidx+5];
+            jnrlistG         = jjnr[jidx+6];
+            jnrlistH         = jjnr[jidx+7];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
+                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
+                                            
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
+            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
+            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
+            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+            j_coord_offsetE  = DIM*jnrE;
+            j_coord_offsetF  = DIM*jnrF;
+            j_coord_offsetG  = DIM*jnrG;
+            j_coord_offsetH  = DIM*jnrH;
+
+            /* load j atom coordinates */
+            gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                                 x+j_coord_offsetC,x+j_coord_offsetD,
+                                                 x+j_coord_offsetE,x+j_coord_offsetF,
+                                                 x+j_coord_offsetG,x+j_coord_offsetH,
+                                                 &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm256_sub_ps(ix0,jx0);
+            dy00             = _mm256_sub_ps(iy0,jy0);
+            dz00             = _mm256_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+            vdwjidx0E        = 2*vdwtype[jnrE+0];
+            vdwjidx0F        = 2*vdwtype[jnrF+0];
+            vdwjidx0G        = 2*vdwtype[jnrG+0];
+            vdwjidx0H        = 2*vdwtype[jnrH+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm256_mul_ps(rsq00,rinv00);
+            r00              = _mm256_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
+                                            vdwioffsetptr0+vdwjidx0B,
+                                            vdwioffsetptr0+vdwjidx0C,
+                                            vdwioffsetptr0+vdwjidx0D,
+                                            vdwioffsetptr0+vdwjidx0E,
+                                            vdwioffsetptr0+vdwjidx0F,
+                                            vdwioffsetptr0+vdwjidx0G,
+                                            vdwioffsetptr0+vdwjidx0H,
+                                            &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
+                                                                  vdwgridioffsetptr0+vdwjidx0B,
+                                                                  vdwgridioffsetptr0+vdwjidx0C,
+                                                                  vdwgridioffsetptr0+vdwjidx0D,
+                                                                  vdwgridioffsetptr0+vdwjidx0E,
+                                                                  vdwgridioffsetptr0+vdwjidx0F,
+                                                                  vdwgridioffsetptr0+vdwjidx0G,
+                                                                  vdwgridioffsetptr0+vdwjidx0H);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm256_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm256_mul_ps(fscal,dx00);
+            ty               = _mm256_mul_ps(fscal,dy00);
+            tz               = _mm256_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm256_add_ps(fix0,tx);
+            fiy0             = _mm256_add_ps(fiy0,ty);
+            fiz0             = _mm256_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
+            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
+            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
+            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
+            gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                                 f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*47);
+}
index 0b25d2bae04f2746ae14242e2b1a97fd4e901ad2..e94a4d3e3335409a087c4ebfdd228ad1b88bf839 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_single;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_single;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single;
 nb_kernel_info_t
     kernellist_avx_256_single[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_avx_256_single", "avx_256_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_avx_256_single", "avx_256_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_single", "avx_256_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 14af0f442363e11f12519c1f0303036746c8b181..77e9227ff7397b243592d12d2e869ed8a95d4db5 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -84,6 +84,9 @@
 
 /* ## Calculate the size and offset for (merged/interleaved) table data */
 
+
+
+
 /*
  * Gromacs nonbonded kernel:   {KERNEL_NAME}
  * Electrostatics interaction: {KERNEL_ELEC}
@@ -124,6 +127,9 @@ void
     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
     /* #for I in PARTICLES_I */
     real *           vdwioffsetptr{I};
+    /* #if 'LJEwald' in KERNEL_VDW */
+    real *           vdwgridioffsetptr{I};
+    /* #endif                      */
     __m256           ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
     /* #endfor */
     /* #for J in PARTICLES_J */
@@ -159,6 +165,15 @@ void
     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m256           c6grid_{I}{J};
+    /* #endfor */
+    real             *vdwgridparam;
+    __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    __m256           one_half  = _mm256_set1_ps(0.5);
+    __m256           minus_one = _mm256_set1_ps(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m256i          ewitab;
     __m128i          ewitab_lo,ewitab_hi;
@@ -199,6 +214,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm256_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm256_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -244,6 +265,9 @@ void
     /*     #endfor */
     /*     #for I in PARTICLES_VDW_I */
     vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
+    /*         #if 'LJEwald' in KERNEL_VDW */
+    vdwgridioffsetptr{I} = vdwgridparam+2*nvdwtype*vdwtype[inr+{I}];
+    /*         #endif */
     /*     #endfor */
     /* #endif */
 
@@ -259,8 +283,14 @@ void
     qq{I}{J}             = _mm256_mul_ps(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+        /*         #if 'LJEwald' in KERNEL_VDW */
+    c6_{I}{J}            = _mm256_set1_ps(vdwioffsetptr{I}[vdwjidx{J}A]);
+    c12_{I}{J}           = _mm256_set1_ps(vdwioffsetptr{I}[vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm256_set1_ps(vdwgridioffsetptr{I}[vdwjidx{J}A]);
+        /*         #else */
     c6_{I}{J}            = _mm256_set1_ps(vdwioffsetptr{I}[vdwjidx{J}A]);
     c12_{I}{J}           = _mm256_set1_ps(vdwioffsetptr{I}[vdwjidx{J}A+1]);
+        /*         #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -373,6 +403,9 @@ void
         /*     #endfor */
         /*     #for I in PARTICLES_VDW_I */
         vdwioffsetptr{I}   = vdwparam+2*nvdwtype*vdwtype[inr+{I}];
+        /*         #if 'LJEwald' in KERNEL_VDW */
+        vdwgridioffsetptr{I} = vdwgridparam+2*nvdwtype*vdwtype[inr+{I}];
+        /*         #endif */
         /*     #endfor */
         /* #endif */
 
@@ -593,6 +626,17 @@ void
                                             vdwioffsetptr{I}+vdwjidx{J}G,
                                             vdwioffsetptr{I}+vdwjidx{J}H,
                                             &c6_{I}{J},&c12_{I}{J});
+
+            /*         #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr{I}+vdwjidx{J}A,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}B,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}C,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}D,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}E,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}F,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}G,
+                                                                  vdwgridioffsetptr{I}+vdwjidx{J}H);
+            /*          #endif */
             /*         #endif */
             /*     #endif */
 
@@ -605,7 +649,7 @@ void
             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
-            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
+            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
             /*             ## 3 tables, 4 bytes per point: multiply index by 12 */
             vfitab_lo        = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
             vfitab_hi        = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
@@ -613,7 +657,7 @@ void
             /*             ## 1 table, 4 bytes per point: multiply index by 4   */
             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
-            /*         #elif 'Table' in KERNEL_VDW                              */
+            /*         #elif 'Table' in KERNEL_VDW */
             /*             ## 2 tables, 4 bytes per point: multiply index by 8  */
             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
@@ -867,7 +911,46 @@ void
             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
-            /*         #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm256_mul_ps(ewclj2,rsq{I}{J});
+            ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_{I}{J},_mm256_mul_ps(c6grid_{I}{J},_mm256_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm256_mul_ps(c12_{I}{J},_mm256_mul_ps(rinvsix,rinvsix));
+           /*                 #define INNERFLOPS INNERFLOPS+6 */
+           /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+           vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_{I}{J},_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_add_ps(_mm256_mul_ps(c6_{I}{J},sh_vdw_invrcut6),_mm256_mul_ps(c6grid_{I}{J},sh_lj_ewald))),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+10 */
+           /*                 #else */
+            vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+           /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_{I}{J},one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm256_mul_ps(c6grid_{I}{J},_mm256_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_{I}{J},one_sixth),_mm256_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_{I}{J},rinvsix),_mm256_sub_ps(c6_{I}{J},f6A)),rinvsix),f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*              #endif */
+           /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
             /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
index 41cfb1e620c870d6a0f67faeec3a9eb0b9380b5a..295fc899e662b8aa18358a05231b45120df96116 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
     'Buckingham'              : ['rinv','rinvsq','r'],
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_c.c
new file mode 100644 (file)
index 0000000..9c5f5fe
--- /dev/null
@@ -0,0 +1,486 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        iq0              = facel*charge[inr+0];
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq00*((rinv00-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = (vvdw12 - c12_00*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_00*sh_vdw_invrcut6 - c6grid_00*sh_lj_ewald)*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+            vvdwsum         += vvdw;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 74 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 15 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*15 + inneriter*74);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        iq0              = facel*charge[inr+0];
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 55 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 13 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*13 + inneriter*55);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_c.c
new file mode 100644 (file)
index 0000000..51665bf
--- /dev/null
@@ -0,0 +1,738 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq00*((rinv00-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = (vvdw12 - c12_00*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_00*sh_vdw_invrcut6 - c6grid_00*sh_lj_ewald)*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+            vvdwsum         += vvdw;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq10<rcutoff2)
+            {
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq10*((rinv10-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq20<rcutoff2)
+            {
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq20*((rinv20-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 158 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 32 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*32 + inneriter*158);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq10<rcutoff2)
+            {
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq20<rcutoff2)
+            {
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 123 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 30 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*30 + inneriter*123);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_c.c
new file mode 100644 (file)
index 0000000..2836f45
--- /dev/null
@@ -0,0 +1,1312 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01,cexp1_01,cexp2_01;
+    real             dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02,cexp1_02,cexp2_02;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_01;
+    real             c6grid_02;
+    real             c6grid_10;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_20;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = charge[inr+0];
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    qq00             = iq0*jq0;
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq01             = iq0*jq1;
+    qq02             = iq0*jq2;
+    qq10             = iq1*jq0;
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq20             = iq2*jq0;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx01             = ix0 - jx1;
+            dy01             = iy0 - jy1;
+            dz01             = iz0 - jz1;
+            dx02             = ix0 - jx2;
+            dy02             = iy0 - jy2;
+            dz02             = iz0 - jz2;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq01            = dx01*dx01+dy01*dy01+dz01*dz01;
+            rsq02            = dx02*dx02+dy02*dy02+dz02*dz02;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv01           = gmx_invsqrt(rsq01);
+            rinv02           = gmx_invsqrt(rsq02);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq01         = rinv01*rinv01;
+            rinvsq02         = rinv02*rinv02;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq00*((rinv00-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = (vvdw12 - c12_00*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_00*sh_vdw_invrcut6 - c6grid_00*sh_lj_ewald)*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+            vvdwsum         += vvdw;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq01<rcutoff2)
+            {
+
+            r01              = rsq01*rinv01;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r01*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq01*((rinv01-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq01*rinv01*(rinvsq01-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx01;
+            ty               = fscal*dy01;
+            tz               = fscal*dz01;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq02<rcutoff2)
+            {
+
+            r02              = rsq02*rinv02;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r02*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq02*((rinv02-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq02*rinv02*(rinvsq02-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx02;
+            ty               = fscal*dy02;
+            tz               = fscal*dz02;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq10<rcutoff2)
+            {
+
+            r10              = rsq10*rinv10;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq10*((rinv10-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq11<rcutoff2)
+            {
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq11*((rinv11-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq12<rcutoff2)
+            {
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq12*((rinv12-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq20<rcutoff2)
+            {
+
+            r20              = rsq20*rinv20;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq20*((rinv20-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq21<rcutoff2)
+            {
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq21*((rinv21-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq22<rcutoff2)
+            {
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq22*((rinv22-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 401 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 32 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*32 + inneriter*401);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01,cexp1_01,cexp2_01;
+    real             dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02,cexp1_02,cexp2_02;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_01;
+    real             c6grid_02;
+    real             c6grid_10;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_20;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = charge[inr+0];
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    qq00             = iq0*jq0;
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq01             = iq0*jq1;
+    qq02             = iq0*jq2;
+    qq10             = iq1*jq0;
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq20             = iq2*jq0;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx01             = ix0 - jx1;
+            dy01             = iy0 - jy1;
+            dz01             = iz0 - jz1;
+            dx02             = ix0 - jx2;
+            dy02             = iy0 - jy2;
+            dz02             = iz0 - jz2;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq01            = dx01*dx01+dy01*dy01+dz01*dz01;
+            rsq02            = dx02*dx02+dy02*dy02+dz02*dz02;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv01           = gmx_invsqrt(rsq01);
+            rinv02           = gmx_invsqrt(rsq02);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq01         = rinv01*rinv01;
+            rinvsq02         = rinv02*rinv02;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq01<rcutoff2)
+            {
+
+            r01              = rsq01*rinv01;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r01*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq01*rinv01*(rinvsq01-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx01;
+            ty               = fscal*dy01;
+            tz               = fscal*dz01;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq02<rcutoff2)
+            {
+
+            r02              = rsq02*rinv02;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r02*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq02*rinv02*(rinvsq02-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx02;
+            ty               = fscal*dy02;
+            tz               = fscal*dz02;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq10<rcutoff2)
+            {
+
+            r10              = rsq10*rinv10;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq11<rcutoff2)
+            {
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq12<rcutoff2)
+            {
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq20<rcutoff2)
+            {
+
+            r20              = rsq20*rinv20;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq21<rcutoff2)
+            {
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq22<rcutoff2)
+            {
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 318 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 30 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*30 + inneriter*318);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_c.c
new file mode 100644 (file)
index 0000000..82c747d
--- /dev/null
@@ -0,0 +1,838 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30,cexp1_30,cexp2_30;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             c6grid_30;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx30             = ix3 - jx0;
+            dy30             = iy3 - jy0;
+            dz30             = iz3 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq30            = dx30*dx30+dy30*dy30+dz30*dz30;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv30           = gmx_invsqrt(rsq30);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq30         = rinv30*rinv30;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = (vvdw12 - c12_00*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_00*sh_vdw_invrcut6 - c6grid_00*sh_lj_ewald)*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            vvdwsum         += vvdw;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq10<rcutoff2)
+            {
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq10*((rinv10-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq20<rcutoff2)
+            {
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq20*((rinv20-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq30<rcutoff2)
+            {
+
+            r30              = rsq30*rinv30;
+
+            qq30             = iq3*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r30*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq30*((rinv30-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq30*rinv30*(rinvsq30-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx30;
+            ty               = fscal*dy30;
+            tz               = fscal*dz30;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 181 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 41 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*41 + inneriter*181);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30,cexp1_30,cexp2_30;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             c6grid_30;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx30             = ix3 - jx0;
+            dy30             = iy3 - jy0;
+            dz30             = iz3 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq30            = dx30*dx30+dy30*dy30+dz30*dz30;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv30           = gmx_invsqrt(rsq30);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq30         = rinv30*rinv30;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq10<rcutoff2)
+            {
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq20<rcutoff2)
+            {
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq30<rcutoff2)
+            {
+
+            r30              = rsq30*rinv30;
+
+            qq30             = iq3*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r30*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq30*rinv30*(rinvsq30-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx30;
+            ty               = fscal*dy30;
+            tz               = fscal*dz30;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 146 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 39 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*39 + inneriter*146);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_c.c
new file mode 100644 (file)
index 0000000..708501b
--- /dev/null
@@ -0,0 +1,1420 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3;
+    real             jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13,cexp1_13,cexp2_13;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23,cexp1_23,cexp2_23;
+    real             dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31,cexp1_31,cexp2_31;
+    real             dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32,cexp1_32,cexp2_32;
+    real             dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33,cexp1_33,cexp2_33;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_13;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             c6grid_23;
+    real             c6grid_31;
+    real             c6grid_32;
+    real             c6grid_33;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    jq3              = charge[inr+3];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq13             = iq1*jq3;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+    qq23             = iq2*jq3;
+    qq31             = iq3*jq1;
+    qq32             = iq3*jq2;
+    qq33             = iq3*jq3;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+            jx3              = x[j_coord_offset+DIM*3+XX];
+            jy3              = x[j_coord_offset+DIM*3+YY];
+            jz3              = x[j_coord_offset+DIM*3+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx13             = ix1 - jx3;
+            dy13             = iy1 - jy3;
+            dz13             = iz1 - jz3;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+            dx23             = ix2 - jx3;
+            dy23             = iy2 - jy3;
+            dz23             = iz2 - jz3;
+            dx31             = ix3 - jx1;
+            dy31             = iy3 - jy1;
+            dz31             = iz3 - jz1;
+            dx32             = ix3 - jx2;
+            dy32             = iy3 - jy2;
+            dz32             = iz3 - jz2;
+            dx33             = ix3 - jx3;
+            dy33             = iy3 - jy3;
+            dz33             = iz3 - jz3;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq13            = dx13*dx13+dy13*dy13+dz13*dz13;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+            rsq23            = dx23*dx23+dy23*dy23+dz23*dz23;
+            rsq31            = dx31*dx31+dy31*dy31+dz31*dz31;
+            rsq32            = dx32*dx32+dy32*dy32+dz32*dz32;
+            rsq33            = dx33*dx33+dy33*dy33+dz33*dz33;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv13           = gmx_invsqrt(rsq13);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+            rinv23           = gmx_invsqrt(rsq23);
+            rinv31           = gmx_invsqrt(rsq31);
+            rinv32           = gmx_invsqrt(rsq32);
+            rinv33           = gmx_invsqrt(rsq33);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq13         = rinv13*rinv13;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+            rinvsq23         = rinv23*rinv23;
+            rinvsq31         = rinv31*rinv31;
+            rinvsq32         = rinv32*rinv32;
+            rinvsq33         = rinv33*rinv33;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = (vvdw12 - c12_00*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_00*sh_vdw_invrcut6 - c6grid_00*sh_lj_ewald)*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            vvdwsum         += vvdw;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq11<rcutoff2)
+            {
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq11*((rinv11-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq12<rcutoff2)
+            {
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq12*((rinv12-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq13<rcutoff2)
+            {
+
+            r13              = rsq13*rinv13;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r13*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq13*((rinv13-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq13*rinv13*(rinvsq13-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx13;
+            ty               = fscal*dy13;
+            tz               = fscal*dz13;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq21<rcutoff2)
+            {
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq21*((rinv21-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq22<rcutoff2)
+            {
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq22*((rinv22-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq23<rcutoff2)
+            {
+
+            r23              = rsq23*rinv23;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r23*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq23*((rinv23-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq23*rinv23*(rinvsq23-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx23;
+            ty               = fscal*dy23;
+            tz               = fscal*dz23;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq31<rcutoff2)
+            {
+
+            r31              = rsq31*rinv31;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r31*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq31*((rinv31-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq31*rinv31*(rinvsq31-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx31;
+            ty               = fscal*dy31;
+            tz               = fscal*dz31;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq32<rcutoff2)
+            {
+
+            r32              = rsq32*rinv32;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r32*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq32*((rinv32-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq32*rinv32*(rinvsq32-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx32;
+            ty               = fscal*dy32;
+            tz               = fscal*dz32;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq33<rcutoff2)
+            {
+
+            r33              = rsq33*rinv33;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r33*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq33*((rinv33-sh_ewald)-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq33*rinv33*(rinvsq33-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx33;
+            ty               = fscal*dy33;
+            tz               = fscal*dz33;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 424 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 41 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*41 + inneriter*424);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3;
+    real             jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13,cexp1_13,cexp2_13;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23,cexp1_23,cexp2_23;
+    real             dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31,cexp1_31,cexp2_31;
+    real             dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32,cexp1_32,cexp2_32;
+    real             dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33,cexp1_33,cexp2_33;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_13;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             c6grid_23;
+    real             c6grid_31;
+    real             c6grid_32;
+    real             c6grid_33;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    jq3              = charge[inr+3];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq13             = iq1*jq3;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+    qq23             = iq2*jq3;
+    qq31             = iq3*jq1;
+    qq32             = iq3*jq2;
+    qq33             = iq3*jq3;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff          = fr->rcoulomb;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+            jx3              = x[j_coord_offset+DIM*3+XX];
+            jy3              = x[j_coord_offset+DIM*3+YY];
+            jz3              = x[j_coord_offset+DIM*3+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx13             = ix1 - jx3;
+            dy13             = iy1 - jy3;
+            dz13             = iz1 - jz3;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+            dx23             = ix2 - jx3;
+            dy23             = iy2 - jy3;
+            dz23             = iz2 - jz3;
+            dx31             = ix3 - jx1;
+            dy31             = iy3 - jy1;
+            dz31             = iz3 - jz1;
+            dx32             = ix3 - jx2;
+            dy32             = iy3 - jy2;
+            dz32             = iz3 - jz2;
+            dx33             = ix3 - jx3;
+            dy33             = iy3 - jy3;
+            dz33             = iz3 - jz3;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq13            = dx13*dx13+dy13*dy13+dz13*dz13;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+            rsq23            = dx23*dx23+dy23*dy23+dz23*dz23;
+            rsq31            = dx31*dx31+dy31*dy31+dz31*dz31;
+            rsq32            = dx32*dx32+dy32*dy32+dz32*dz32;
+            rsq33            = dx33*dx33+dy33*dy33+dz33*dz33;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv13           = gmx_invsqrt(rsq13);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+            rinv23           = gmx_invsqrt(rsq23);
+            rinv31           = gmx_invsqrt(rsq31);
+            rinv32           = gmx_invsqrt(rsq32);
+            rinv33           = gmx_invsqrt(rsq33);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq13         = rinv13*rinv13;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+            rinvsq23         = rinv23*rinv23;
+            rinvsq31         = rinv31*rinv31;
+            rinvsq32         = rinv32*rinv32;
+            rinvsq33         = rinv33*rinv33;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq11<rcutoff2)
+            {
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq12<rcutoff2)
+            {
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq13<rcutoff2)
+            {
+
+            r13              = rsq13*rinv13;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r13*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq13*rinv13*(rinvsq13-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx13;
+            ty               = fscal*dy13;
+            tz               = fscal*dz13;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq21<rcutoff2)
+            {
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq22<rcutoff2)
+            {
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq23<rcutoff2)
+            {
+
+            r23              = rsq23*rinv23;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r23*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq23*rinv23*(rinvsq23-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx23;
+            ty               = fscal*dy23;
+            tz               = fscal*dz23;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq31<rcutoff2)
+            {
+
+            r31              = rsq31*rinv31;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r31*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq31*rinv31*(rinvsq31-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx31;
+            ty               = fscal*dy31;
+            tz               = fscal*dz31;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq32<rcutoff2)
+            {
+
+            r32              = rsq32*rinv32;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r32*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq32*rinv32*(rinvsq32-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx32;
+            ty               = fscal*dy32;
+            tz               = fscal*dz32;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq33<rcutoff2)
+            {
+
+            r33              = rsq33*rinv33;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r33*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq33*rinv33*(rinvsq33-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx33;
+            ty               = fscal*dy33;
+            tz               = fscal*dz33;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 341 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 39 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*39 + inneriter*341);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_c.c
new file mode 100644 (file)
index 0000000..4c5bad5
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        iq0              = facel*charge[inr+0];
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq00*(rinv00-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+            vvdwsum         += vvdw;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 67 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 15 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*15 + inneriter*67);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        iq0              = facel*charge[inr+0];
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 55 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 13 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*13 + inneriter*55);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_c.c
new file mode 100644 (file)
index 0000000..130bd66
--- /dev/null
@@ -0,0 +1,694 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq00*(rinv00-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+            vvdwsum         += vvdw;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq10*(rinv10-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq20*(rinv20-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 149 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 32 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*32 + inneriter*149);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            qq00             = iq0*jq0;
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 123 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 30 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*30 + inneriter*123);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_c.c
new file mode 100644 (file)
index 0000000..30301f6
--- /dev/null
@@ -0,0 +1,1208 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01,cexp1_01,cexp2_01;
+    real             dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02,cexp1_02,cexp2_02;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_01;
+    real             c6grid_02;
+    real             c6grid_10;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_20;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = charge[inr+0];
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    qq00             = iq0*jq0;
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq01             = iq0*jq1;
+    qq02             = iq0*jq2;
+    qq10             = iq1*jq0;
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq20             = iq2*jq0;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx01             = ix0 - jx1;
+            dy01             = iy0 - jy1;
+            dz01             = iz0 - jz1;
+            dx02             = ix0 - jx2;
+            dy02             = iy0 - jy2;
+            dz02             = iz0 - jz2;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq01            = dx01*dx01+dy01*dy01+dz01*dz01;
+            rsq02            = dx02*dx02+dy02*dy02+dz02*dz02;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv01           = gmx_invsqrt(rsq01);
+            rinv02           = gmx_invsqrt(rsq02);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq01         = rinv01*rinv01;
+            rinvsq02         = rinv02*rinv02;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq00*(rinv00-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+            vvdwsum         += vvdw;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = rsq01*rinv01;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r01*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq01*(rinv01-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq01*rinv01*(rinvsq01-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx01;
+            ty               = fscal*dy01;
+            tz               = fscal*dz01;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = rsq02*rinv02;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r02*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq02*(rinv02-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq02*rinv02*(rinvsq02-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx02;
+            ty               = fscal*dy02;
+            tz               = fscal*dz02;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = rsq10*rinv10;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq10*(rinv10-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq11*(rinv11-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq12*(rinv12-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = rsq20*rinv20;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq20*(rinv20-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq21*(rinv21-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq22*(rinv22-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /* Inner loop uses 386 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 32 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*32 + inneriter*386);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01,cexp1_01,cexp2_01;
+    real             dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02,cexp1_02,cexp2_02;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_01;
+    real             c6grid_02;
+    real             c6grid_10;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_20;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = facel*charge[inr+0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = charge[inr+0];
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    qq00             = iq0*jq0;
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq01             = iq0*jq1;
+    qq02             = iq0*jq2;
+    qq10             = iq1*jq0;
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq20             = iq2*jq0;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx01             = ix0 - jx1;
+            dy01             = iy0 - jy1;
+            dz01             = iz0 - jz1;
+            dx02             = ix0 - jx2;
+            dy02             = iy0 - jy2;
+            dz02             = iz0 - jz2;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq01            = dx01*dx01+dy01*dy01+dz01*dz01;
+            rsq02            = dx02*dx02+dy02*dy02+dz02*dz02;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv01           = gmx_invsqrt(rsq01);
+            rinv02           = gmx_invsqrt(rsq02);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq01         = rinv01*rinv01;
+            rinvsq02         = rinv02*rinv02;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r00*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq00*rinv00*(rinvsq00-felec);
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = felec+fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = rsq01*rinv01;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r01*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq01*rinv01*(rinvsq01-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx01;
+            ty               = fscal*dy01;
+            tz               = fscal*dz01;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = rsq02*rinv02;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r02*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq02*rinv02*(rinvsq02-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx02;
+            ty               = fscal*dy02;
+            tz               = fscal*dz02;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = rsq10*rinv10;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = rsq20*rinv20;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /* Inner loop uses 318 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 30 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*30 + inneriter*318);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_c.c
new file mode 100644 (file)
index 0000000..7baa0a4
--- /dev/null
@@ -0,0 +1,784 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30,cexp1_30,cexp2_30;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             c6grid_30;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx30             = ix3 - jx0;
+            dy30             = iy3 - jy0;
+            dz30             = iz3 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq30            = dx30*dx30+dy30*dy30+dz30*dz30;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv30           = gmx_invsqrt(rsq30);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq30         = rinv30*rinv30;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            vvdwsum         += vvdw;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq10*(rinv10-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq20*(rinv20-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = rsq30*rinv30;
+
+            qq30             = iq3*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r30*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq30*(rinv30-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq30*rinv30*(rinvsq30-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx30;
+            ty               = fscal*dy30;
+            tz               = fscal*dz30;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 172 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 41 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*41 + inneriter*172);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10,cexp1_10,cexp2_10;
+    real             dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20,cexp1_20,cexp2_20;
+    real             dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30,cexp1_30,cexp2_30;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_10;
+    real             c6grid_20;
+    real             c6grid_30;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx10             = ix1 - jx0;
+            dy10             = iy1 - jy0;
+            dz10             = iz1 - jz0;
+            dx20             = ix2 - jx0;
+            dy20             = iy2 - jy0;
+            dz20             = iz2 - jz0;
+            dx30             = ix3 - jx0;
+            dy30             = iy3 - jy0;
+            dz30             = iz3 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq10            = dx10*dx10+dy10*dy10+dz10*dz10;
+            rsq20            = dx20*dx20+dy20*dy20+dz20*dz20;
+            rsq30            = dx30*dx30+dy30*dy30+dz30*dz30;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv10           = gmx_invsqrt(rsq10);
+            rinv20           = gmx_invsqrt(rsq20);
+            rinv30           = gmx_invsqrt(rsq30);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq10         = rinv10*rinv10;
+            rinvsq20         = rinv20*rinv20;
+            rinvsq30         = rinv30*rinv30;
+
+            /* Load parameters for j particles */
+            jq0              = charge[jnr+0];
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = rsq10*rinv10;
+
+            qq10             = iq1*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r10*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq10*rinv10*(rinvsq10-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx10;
+            ty               = fscal*dy10;
+            tz               = fscal*dz10;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = rsq20*rinv20;
+
+            qq20             = iq2*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r20*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq20*rinv20*(rinvsq20-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx20;
+            ty               = fscal*dy20;
+            tz               = fscal*dz20;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = rsq30*rinv30;
+
+            qq30             = iq3*jq0;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r30*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq30*rinv30*(rinvsq30-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx30;
+            ty               = fscal*dy30;
+            tz               = fscal*dz30;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 146 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 39 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*39 + inneriter*146);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_c.c
new file mode 100644 (file)
index 0000000..064da21
--- /dev/null
@@ -0,0 +1,1306 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3;
+    real             jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13,cexp1_13,cexp2_13;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23,cexp1_23,cexp2_23;
+    real             dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31,cexp1_31,cexp2_31;
+    real             dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32,cexp1_32,cexp2_32;
+    real             dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33,cexp1_33,cexp2_33;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_13;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             c6grid_23;
+    real             c6grid_31;
+    real             c6grid_32;
+    real             c6grid_33;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    jq3              = charge[inr+3];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq13             = iq1*jq3;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+    qq23             = iq2*jq3;
+    qq31             = iq3*jq1;
+    qq32             = iq3*jq2;
+    qq33             = iq3*jq3;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Reset potential sums */
+        velecsum         = 0.0;
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+            jx3              = x[j_coord_offset+DIM*3+XX];
+            jy3              = x[j_coord_offset+DIM*3+YY];
+            jz3              = x[j_coord_offset+DIM*3+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx13             = ix1 - jx3;
+            dy13             = iy1 - jy3;
+            dz13             = iz1 - jz3;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+            dx23             = ix2 - jx3;
+            dy23             = iy2 - jy3;
+            dz23             = iz2 - jz3;
+            dx31             = ix3 - jx1;
+            dy31             = iy3 - jy1;
+            dz31             = iz3 - jz1;
+            dx32             = ix3 - jx2;
+            dy32             = iy3 - jy2;
+            dz32             = iz3 - jz2;
+            dx33             = ix3 - jx3;
+            dy33             = iy3 - jy3;
+            dz33             = iz3 - jz3;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq13            = dx13*dx13+dy13*dy13+dz13*dz13;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+            rsq23            = dx23*dx23+dy23*dy23+dz23*dz23;
+            rsq31            = dx31*dx31+dy31*dy31+dz31*dz31;
+            rsq32            = dx32*dx32+dy32*dy32+dz32*dz32;
+            rsq33            = dx33*dx33+dy33*dy33+dz33*dz33;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv13           = gmx_invsqrt(rsq13);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+            rinv23           = gmx_invsqrt(rsq23);
+            rinv31           = gmx_invsqrt(rsq31);
+            rinv32           = gmx_invsqrt(rsq32);
+            rinv33           = gmx_invsqrt(rsq33);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq13         = rinv13*rinv13;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+            rinvsq23         = rinv23*rinv23;
+            rinvsq31         = rinv31*rinv31;
+            rinvsq32         = rinv32*rinv32;
+            rinvsq33         = rinv33*rinv33;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            vvdwsum         += vvdw;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq11*(rinv11-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq12*(rinv12-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = rsq13*rinv13;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r13*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq13*(rinv13-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq13*rinv13*(rinvsq13-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx13;
+            ty               = fscal*dy13;
+            tz               = fscal*dz13;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq21*(rinv21-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq22*(rinv22-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = rsq23*rinv23;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r23*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq23*(rinv23-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq23*rinv23*(rinvsq23-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx23;
+            ty               = fscal*dy23;
+            tz               = fscal*dz23;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = rsq31*rinv31;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r31*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq31*(rinv31-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq31*rinv31*(rinvsq31-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx31;
+            ty               = fscal*dy31;
+            tz               = fscal*dz31;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = rsq32*rinv32;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r32*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq32*(rinv32-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq32*rinv32*(rinvsq32-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx32;
+            ty               = fscal*dy32;
+            tz               = fscal*dz32;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = rsq33*rinv33;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r33*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            ewitab           = 4*ewitab;
+            felec            = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            velec            = qq33*(rinv33-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+felec)));
+            felec            = qq33*rinv33*(rinvsq33-felec);
+
+            /* Update potential sums from outer loop */
+            velecsum        += velec;
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx33;
+            ty               = fscal*dy33;
+            tz               = fscal*dz33;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            /* Inner loop uses 409 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_elec[ggid] += velecsum;
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 41 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*41 + inneriter*409);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_c
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    real             ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    real             ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    real             ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1;
+    real             jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2;
+    real             jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3;
+    real             jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    real             dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11,cexp1_11,cexp2_11;
+    real             dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12,cexp1_12,cexp2_12;
+    real             dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13,cexp1_13,cexp2_13;
+    real             dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21,cexp1_21,cexp2_21;
+    real             dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22,cexp1_22,cexp2_22;
+    real             dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23,cexp1_23,cexp2_23;
+    real             dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31,cexp1_31,cexp2_31;
+    real             dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32,cexp1_32,cexp2_32;
+    real             dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33,cexp1_33,cexp2_33;
+    real             velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             c6grid_11;
+    real             c6grid_12;
+    real             c6grid_13;
+    real             c6grid_21;
+    real             c6grid_22;
+    real             c6grid_23;
+    real             c6grid_31;
+    real             c6grid_32;
+    real             c6grid_33;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    int              ewitab;
+    real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
+    real             *ewtab;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = fr->epsfac;
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    sh_ewald         = fr->ic->sh_ewald;
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = fr->ic->tabq_scale;
+    ewtabhalfspace   = 0.5/ewtabscale;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = facel*charge[inr+1];
+    iq2              = facel*charge[inr+2];
+    iq3              = facel*charge[inr+3];
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = charge[inr+1];
+    jq2              = charge[inr+2];
+    jq3              = charge[inr+3];
+    vdwjidx0         = 2*vdwtype[inr+0];
+    c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+    c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+    c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+    qq11             = iq1*jq1;
+    qq12             = iq1*jq2;
+    qq13             = iq1*jq3;
+    qq21             = iq2*jq1;
+    qq22             = iq2*jq2;
+    qq23             = iq2*jq3;
+    qq31             = iq3*jq1;
+    qq32             = iq3*jq2;
+    qq33             = iq3*jq3;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+        ix1              = shX + x[i_coord_offset+DIM*1+XX];
+        iy1              = shY + x[i_coord_offset+DIM*1+YY];
+        iz1              = shZ + x[i_coord_offset+DIM*1+ZZ];
+        ix2              = shX + x[i_coord_offset+DIM*2+XX];
+        iy2              = shY + x[i_coord_offset+DIM*2+YY];
+        iz2              = shZ + x[i_coord_offset+DIM*2+ZZ];
+        ix3              = shX + x[i_coord_offset+DIM*3+XX];
+        iy3              = shY + x[i_coord_offset+DIM*3+YY];
+        iz3              = shZ + x[i_coord_offset+DIM*3+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+        fix1             = 0.0;
+        fiy1             = 0.0;
+        fiz1             = 0.0;
+        fix2             = 0.0;
+        fiy2             = 0.0;
+        fiz2             = 0.0;
+        fix3             = 0.0;
+        fiy3             = 0.0;
+        fiz3             = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+            jx1              = x[j_coord_offset+DIM*1+XX];
+            jy1              = x[j_coord_offset+DIM*1+YY];
+            jz1              = x[j_coord_offset+DIM*1+ZZ];
+            jx2              = x[j_coord_offset+DIM*2+XX];
+            jy2              = x[j_coord_offset+DIM*2+YY];
+            jz2              = x[j_coord_offset+DIM*2+ZZ];
+            jx3              = x[j_coord_offset+DIM*3+XX];
+            jy3              = x[j_coord_offset+DIM*3+YY];
+            jz3              = x[j_coord_offset+DIM*3+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+            dx11             = ix1 - jx1;
+            dy11             = iy1 - jy1;
+            dz11             = iz1 - jz1;
+            dx12             = ix1 - jx2;
+            dy12             = iy1 - jy2;
+            dz12             = iz1 - jz2;
+            dx13             = ix1 - jx3;
+            dy13             = iy1 - jy3;
+            dz13             = iz1 - jz3;
+            dx21             = ix2 - jx1;
+            dy21             = iy2 - jy1;
+            dz21             = iz2 - jz1;
+            dx22             = ix2 - jx2;
+            dy22             = iy2 - jy2;
+            dz22             = iz2 - jz2;
+            dx23             = ix2 - jx3;
+            dy23             = iy2 - jy3;
+            dz23             = iz2 - jz3;
+            dx31             = ix3 - jx1;
+            dy31             = iy3 - jy1;
+            dz31             = iz3 - jz1;
+            dx32             = ix3 - jx2;
+            dy32             = iy3 - jy2;
+            dz32             = iz3 - jz2;
+            dx33             = ix3 - jx3;
+            dy33             = iy3 - jy3;
+            dz33             = iz3 - jz3;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rsq12            = dx12*dx12+dy12*dy12+dz12*dz12;
+            rsq13            = dx13*dx13+dy13*dy13+dz13*dz13;
+            rsq21            = dx21*dx21+dy21*dy21+dz21*dz21;
+            rsq22            = dx22*dx22+dy22*dy22+dz22*dz22;
+            rsq23            = dx23*dx23+dy23*dy23+dz23*dz23;
+            rsq31            = dx31*dx31+dy31*dy31+dz31*dz31;
+            rsq32            = dx32*dx32+dy32*dy32+dz32*dz32;
+            rsq33            = dx33*dx33+dy33*dy33+dz33*dz33;
+
+            rinv00           = gmx_invsqrt(rsq00);
+            rinv11           = gmx_invsqrt(rsq11);
+            rinv12           = gmx_invsqrt(rsq12);
+            rinv13           = gmx_invsqrt(rsq13);
+            rinv21           = gmx_invsqrt(rsq21);
+            rinv22           = gmx_invsqrt(rsq22);
+            rinv23           = gmx_invsqrt(rsq23);
+            rinv31           = gmx_invsqrt(rsq31);
+            rinv32           = gmx_invsqrt(rsq32);
+            rinv33           = gmx_invsqrt(rsq33);
+
+            rinvsq00         = rinv00*rinv00;
+            rinvsq11         = rinv11*rinv11;
+            rinvsq12         = rinv12*rinv12;
+            rinvsq13         = rinv13*rinv13;
+            rinvsq21         = rinv21*rinv21;
+            rinvsq22         = rinv22*rinv22;
+            rinvsq23         = rinv23*rinv23;
+            rinvsq31         = rinv31*rinv31;
+            rinvsq32         = rinv32*rinv32;
+            rinvsq33         = rinv33*rinv33;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = rsq11*rinv11;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r11*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq11*rinv11*(rinvsq11-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx11;
+            ty               = fscal*dy11;
+            tz               = fscal*dz11;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = rsq12*rinv12;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r12*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq12*rinv12*(rinvsq12-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx12;
+            ty               = fscal*dy12;
+            tz               = fscal*dz12;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = rsq13*rinv13;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r13*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq13*rinv13*(rinvsq13-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx13;
+            ty               = fscal*dy13;
+            tz               = fscal*dz13;
+
+            /* Update vectorial force */
+            fix1            += tx;
+            fiy1            += ty;
+            fiz1            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = rsq21*rinv21;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r21*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq21*rinv21*(rinvsq21-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx21;
+            ty               = fscal*dy21;
+            tz               = fscal*dz21;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = rsq22*rinv22;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r22*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq22*rinv22*(rinvsq22-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx22;
+            ty               = fscal*dy22;
+            tz               = fscal*dz22;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = rsq23*rinv23;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r23*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq23*rinv23*(rinvsq23-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx23;
+            ty               = fscal*dy23;
+            tz               = fscal*dz23;
+
+            /* Update vectorial force */
+            fix2            += tx;
+            fiy2            += ty;
+            fiz2            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = rsq31*rinv31;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r31*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq31*rinv31*(rinvsq31-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx31;
+            ty               = fscal*dy31;
+            tz               = fscal*dz31;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*1+XX] -= tx;
+            f[j_coord_offset+DIM*1+YY] -= ty;
+            f[j_coord_offset+DIM*1+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = rsq32*rinv32;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r32*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq32*rinv32*(rinvsq32-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx32;
+            ty               = fscal*dy32;
+            tz               = fscal*dz32;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*2+XX] -= tx;
+            f[j_coord_offset+DIM*2+YY] -= ty;
+            f[j_coord_offset+DIM*2+ZZ] -= tz;
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = rsq33*rinv33;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = r33*ewtabscale;
+            ewitab           = ewrt;
+            eweps            = ewrt-ewitab;
+            felec            = (1.0-eweps)*ewtab[ewitab]+eweps*ewtab[ewitab+1];
+            felec            = qq33*rinv33*(rinvsq33-felec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx33;
+            ty               = fscal*dy33;
+            tz               = fscal*dz33;
+
+            /* Update vectorial force */
+            fix3            += tx;
+            fiy3            += ty;
+            fiz3            += tz;
+            f[j_coord_offset+DIM*3+XX] -= tx;
+            f[j_coord_offset+DIM*3+YY] -= ty;
+            f[j_coord_offset+DIM*3+ZZ] -= tz;
+
+            /* Inner loop uses 341 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        f[i_coord_offset+DIM*1+XX] += fix1;
+        f[i_coord_offset+DIM*1+YY] += fiy1;
+        f[i_coord_offset+DIM*1+ZZ] += fiz1;
+        tx                         += fix1;
+        ty                         += fiy1;
+        tz                         += fiz1;
+        f[i_coord_offset+DIM*2+XX] += fix2;
+        f[i_coord_offset+DIM*2+YY] += fiy2;
+        f[i_coord_offset+DIM*2+ZZ] += fiz2;
+        tx                         += fix2;
+        ty                         += fiy2;
+        tz                         += fiz2;
+        f[i_coord_offset+DIM*3+XX] += fix3;
+        f[i_coord_offset+DIM*3+YY] += fiy3;
+        f[i_coord_offset+DIM*3+ZZ] += fiz3;
+        tx                         += fix3;
+        ty                         += fiy3;
+        tz                         += fiz3;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 39 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*39 + inneriter*341);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_c.c
new file mode 100644 (file)
index 0000000..74542cc
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_c
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    rcutoff          = fr->rvdw;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = (vvdw12 - c12_00*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_00*sh_vdw_invrcut6 - c6grid_00*sh_lj_ewald)*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            vvdwsum         += vvdw;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 55 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 13 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*13 + inneriter*55);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_c
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    rcutoff          = fr->rvdw;
+    rcutoff2         = rcutoff*rcutoff;
+
+    sh_vdw_invrcut6  = fr->ic->sh_invrc6;
+    rvdw             = fr->rvdw;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (rsq00<rcutoff2)
+            {
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            }
+
+            /* Inner loop uses 44 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*12 + inneriter*44);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_c.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_c.c
new file mode 100644 (file)
index 0000000..7f7a2b0
--- /dev/null
@@ -0,0 +1,409 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS c kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_c
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = 0.0;
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            vvdw6            = (c6_00-c6grid_00*(1.0-poly))*rinvsix;
+            vvdw12           = c12_00*rinvsix*rinvsix;
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            fvdw             = (vvdw12 - vvdw6 - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            /* Update potential sums from outer loop */
+            vvdwsum         += vvdw;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 49 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        kernel_data->energygrp_vdw[ggid] += vvdwsum;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 13 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*13 + inneriter*49);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_c
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_c
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    int              i_shift_offset,i_coord_offset,j_coord_offset;
+    int              j_index_start,j_index_end;
+    int              nri,inr,ggid,iidx,jidx,jnr,outeriter,inneriter;
+    real             shX,shY,shZ,tx,ty,tz,fscal,rcutoff,rcutoff2;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             *shiftvec,*fshift,*x,*f;
+    int              vdwioffset0;
+    real             ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0;
+    real             jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    real             dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00,cexp1_00,cexp2_00;
+    int              nvdwtype;
+    real             rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,br,vvdwexp,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    real             c6grid_00;
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+        shX              = shiftvec[i_shift_offset+XX];
+        shY              = shiftvec[i_shift_offset+YY];
+        shZ              = shiftvec[i_shift_offset+ZZ];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        ix0              = shX + x[i_coord_offset+DIM*0+XX];
+        iy0              = shY + x[i_coord_offset+DIM*0+YY];
+        iz0              = shZ + x[i_coord_offset+DIM*0+ZZ];
+
+        fix0             = 0.0;
+        fiy0             = 0.0;
+        fiz0             = 0.0;
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end; jidx++)
+        {
+            /* Get j neighbor index, and coordinate index */
+            jnr              = jjnr[jidx];
+            j_coord_offset   = DIM*jnr;
+
+            /* load j atom coordinates */
+            jx0              = x[j_coord_offset+DIM*0+XX];
+            jy0              = x[j_coord_offset+DIM*0+YY];
+            jz0              = x[j_coord_offset+DIM*0+ZZ];
+
+            /* Calculate displacement vector */
+            dx00             = ix0 - jx0;
+            dy00             = iy0 - jy0;
+            dz00             = iz0 - jz0;
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = dx00*dx00+dy00*dy00+dz00*dz00;
+
+            rinv00           = gmx_invsqrt(rsq00);
+
+            rinvsq00         = rinv00*rinv00;
+
+            /* Load parameters for j particles */
+            vdwjidx0         = 2*vdwtype[jnr+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = rsq00*rinv00;
+
+            c6_00            = vdwparam[vdwioffset0+vdwjidx0];
+            c12_00           = vdwparam[vdwioffset0+vdwjidx0+1];
+            c6grid_00        = vdwgridparam[vdwioffset0+vdwjidx0];
+
+            rinvsix          = rinvsq00*rinvsq00*rinvsq00;
+            ewcljrsq         = ewclj2*rsq00;
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            fvdw             = (((c12_00*rinvsix - c6_00 + c6grid_00*(1.0-poly))*rinvsix) - c6grid_00*(1.0/6.0)*exponent*ewclj6)*rinvsq00;
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = fscal*dx00;
+            ty               = fscal*dy00;
+            tz               = fscal*dz00;
+
+            /* Update vectorial force */
+            fix0            += tx;
+            fiy0            += ty;
+            fiz0            += tz;
+            f[j_coord_offset+DIM*0+XX] -= tx;
+            f[j_coord_offset+DIM*0+YY] -= ty;
+            f[j_coord_offset+DIM*0+ZZ] -= tz;
+
+            /* Inner loop uses 44 flops */
+        }
+        /* End of innermost loop */
+
+        tx = ty = tz = 0;
+        f[i_coord_offset+DIM*0+XX] += fix0;
+        f[i_coord_offset+DIM*0+YY] += fiy0;
+        f[i_coord_offset+DIM*0+ZZ] += fiz0;
+        tx                         += fix0;
+        ty                         += fiy0;
+        tz                         += fiz0;
+        fshift[i_shift_offset+XX]  += tx;
+        fshift[i_shift_offset+YY]  += ty;
+        fshift[i_shift_offset+ZZ]  += tz;
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 12 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*12 + inneriter*44);
+}
index 0d470ffc7d4227ceb2a6cadbd99988d09ace22c6..dab6b617a0704392d60372fc61801137f7623533 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_c;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_c;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_c;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_c;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_c;
@@ -54,6 +58,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_VF_c;
 nb_kernel_t nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_F_c;
 nb_kernel_t nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_VF_c;
 nb_kernel_t nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_F_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_c;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_c;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_c;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_c;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_c;
@@ -94,6 +108,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4P1_VF_c;
 nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4P1_F_c;
 nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4W4_VF_c;
 nb_kernel_t nb_kernel_ElecEw_VdwBham_GeomW4W4_F_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_c;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_c;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_c;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_c;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_c;
@@ -347,6 +371,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwBham_GeomW4W4_F_c;
 nb_kernel_info_t
     kernellist_c[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_c", "c", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_c", "c", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_c", "c", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_c", "c", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_c", "c", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_c", "c", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_c", "c", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -361,6 +389,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwBhamSh_GeomP1P1_F_c", "c", "None", "None", "Buckingham", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_VF_c, "nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_VF_c", "c", "None", "None", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_F_c, "nb_kernel_ElecNone_VdwBhamSw_GeomP1P1_F_c", "c", "None", "None", "Buckingham", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_c, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_c", "c", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_c, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_c", "c", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_c, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_c", "c", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_c, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_c", "c", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_c, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_c", "c", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_c, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_c", "c", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_c, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_c", "c", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_c, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_c", "c", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_c, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_c", "c", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_c, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_c", "c", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_c", "c", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_c, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_c", "c", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_c, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_c", "c", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -401,6 +439,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwBham_GeomW4P1_F_c, "nb_kernel_ElecEw_VdwBham_GeomW4P1_F_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwBham_GeomW4W4_VF_c, "nb_kernel_ElecEw_VdwBham_GeomW4W4_VF_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwBham_GeomW4W4_F_c, "nb_kernel_ElecEw_VdwBham_GeomW4W4_F_c", "c", "Ewald", "None", "Buckingham", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_c, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_c", "c", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_c, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_c", "c", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 30bc74efdbee92368706b1df0341832212a22bc3..b59399eb041f96078d48bfca992074409fad86cc 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -161,6 +161,13 @@ void
     real             rt,vfeps,vftabscale,Y,F,Geps,Heps2,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    real             c6grid_{I}{J};
+    /* #endfor */
+    real             ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,sh_lj_ewald;
+    real            *vdwgridparam;
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     int              ewitab;
     real             ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace;
@@ -195,6 +202,13 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    ewclj            = fr->ewaldcoeff_lj;
+    sh_lj_ewald             = fr->ic->sh_lj_ewald;
+    ewclj2           = ewclj*ewclj;
+    ewclj6           = ewclj2*ewclj2*ewclj2;
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -255,6 +269,10 @@ void
     c6_{I}{J}            = vdwparam[vdwioffset{I}+vdwjidx{J}];
     cexp1_{I}{J}         = vdwparam[vdwioffset{I}+vdwjidx{J}+1];
     cexp2_{I}{J}         = vdwparam[vdwioffset{I}+vdwjidx{J}+2];
+    /*             #elif 'LJEwald' in KERNEL_VDW */
+    c6_{I}{J}            = vdwparam[vdwioffset{I}+vdwjidx{J}];
+    c12_{I}{J}           = vdwparam[vdwioffset{I}+vdwjidx{J}+1];
+    c6grid_{I}{J}        = vdwgridparam[vdwioffset{I}+vdwjidx{J}];
     /*             #else */
     c6_{I}{J}            = vdwparam[vdwioffset{I}+vdwjidx{J}];
     c12_{I}{J}           = vdwparam[vdwioffset{I}+vdwjidx{J}+1];
@@ -459,6 +477,10 @@ void
             c6_{I}{J}            = vdwparam[vdwioffset{I}+vdwjidx{J}];
             cexp1_{I}{J}         = vdwparam[vdwioffset{I}+vdwjidx{J}+1];
             cexp2_{I}{J}         = vdwparam[vdwioffset{I}+vdwjidx{J}+2];
+           /*             #elif 'LJEwald' in KERNEL_VDW */
+            c6_{I}{J}            = vdwparam[vdwioffset{I}+vdwjidx{J}];
+            c12_{I}{J}           = vdwparam[vdwioffset{I}+vdwjidx{J}+1];
+            c6grid_{I}{J}        = vdwgridparam[vdwioffset{I}+vdwjidx{J}];
             /*             #else */
             c6_{I}{J}            = vdwparam[vdwioffset{I}+vdwjidx{J}];
             c12_{I}{J}           = vdwparam[vdwioffset{I}+vdwjidx{J}+1];
@@ -699,6 +721,35 @@ void
             fvdw             = -(fvdw6+fvdw12)*vftabscale*rinv{I}{J};
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            rinvsix          = rinvsq{I}{J}*rinvsq{I}{J}*rinvsq{I}{J};
+            ewcljrsq         = ewclj2*rsq{I}{J};
+            exponent        = exp(-ewcljrsq);
+            poly             = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5);
+            /*             #define INNERFLOPS INNERFLOPS+9 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw6            = (c6_{I}{J}-c6grid_{I}{J}*(1.0-poly))*rinvsix;
+            vvdw12           = c12_{I}{J}*rinvsix*rinvsix;
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = (vvdw12 - c12_{I}{J}*sh_vdw_invrcut6*sh_vdw_invrcut6)*(1.0/12.0) - (vvdw6 - c6_{I}{J}*sh_vdw_invrcut6 - c6grid_{I}{J}*sh_lj_ewald)*(1.0/6.0);
+            /*                     #define INNERFLOPS INNERFLOPS+9 */
+            /*                 #else */
+            vvdw             = vvdw12*(1.0/12.0) - vvdw6*(1.0/6.0);
+            /*                     #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                 #if 'Force' in KERNEL_VF */
+            fvdw             = (vvdw12 - vvdw6 - c6grid_{I}{J}*(1.0/6.0)*exponent*ewclj6)*rinvsq{I}{J};
+            /*                     #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
+            fvdw             = (((c12_{I}{J}*rinvsix - c6_{I}{J} + c6grid_{I}{J}*(1.0-poly))*rinvsix) - c6grid_{I}{J}*(1.0/6.0)*exponent*ewclj6)*rinvsq{I}{J};
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 72deaf013d729cb241c5c135bfb26f7b409a3f82..b344ebfb61448df1e3d298250d3b6875785dd185 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2367168
--- /dev/null
@@ -0,0 +1,730 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 79 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 79 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*79);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9420865
--- /dev/null
@@ -0,0 +1,1230 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*180);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*151);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..84da2d8
--- /dev/null
@@ -0,0 +1,2474 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_01;
+    _fjsp_v2r8           c6grid_02;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 471 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 471 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*471);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_01;
+    _fjsp_v2r8           c6grid_02;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..33a8e26
--- /dev/null
@@ -0,0 +1,1388 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_30;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 209 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 209 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*209);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_30;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*180);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9ae76e1
--- /dev/null
@@ -0,0 +1,2644 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_13;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    _fjsp_v2r8           c6grid_23;
+    _fjsp_v2r8           c6grid_31;
+    _fjsp_v2r8           c6grid_32;
+    _fjsp_v2r8           c6grid_33;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 503 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 503 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*503);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_13;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    _fjsp_v2r8           c6grid_23;
+    _fjsp_v2r8           c6grid_31;
+    _fjsp_v2r8           c6grid_32;
+    _fjsp_v2r8           c6grid_33;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*432);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..38a7346
--- /dev/null
@@ -0,0 +1,672 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 68 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 68 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*68);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 61 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 61 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*61);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c1e9a2b
--- /dev/null
@@ -0,0 +1,1096 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*159);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 142 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 142 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*142);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..49bf255
--- /dev/null
@@ -0,0 +1,2112 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_01;
+    _fjsp_v2r8           c6grid_02;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 420 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 420 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*420);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_01;
+    _fjsp_v2r8           c6grid_02;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..193ab17
--- /dev/null
@@ -0,0 +1,1218 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_30;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 185 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 185 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*185);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_10;
+    _fjsp_v2r8           c6grid_20;
+    _fjsp_v2r8           c6grid_30;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 168 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 168 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*168);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..df41d3c
--- /dev/null
@@ -0,0 +1,2246 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_13;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    _fjsp_v2r8           c6grid_23;
+    _fjsp_v2r8           c6grid_31;
+    _fjsp_v2r8           c6grid_32;
+    _fjsp_v2r8           c6grid_33;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 449 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 449 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*449);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    _fjsp_v2r8           c6grid_11;
+    _fjsp_v2r8           c6grid_12;
+    _fjsp_v2r8           c6grid_13;
+    _fjsp_v2r8           c6grid_21;
+    _fjsp_v2r8           c6grid_22;
+    _fjsp_v2r8           c6grid_23;
+    _fjsp_v2r8           c6grid_31;
+    _fjsp_v2r8           c6grid_32;
+    _fjsp_v2r8           c6grid_33;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 402 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 402 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*402);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8e1982d
--- /dev/null
@@ -0,0 +1,626 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_00,sh_lj_ewald,_fjsp_mul_v2r8(c6_00,sh_vdw_invrcut6))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 51 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*51);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..f797be7
--- /dev/null
@@ -0,0 +1,572 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 50 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_00,_fjsp_sub_v2r8(one,poly),c6_00),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*50);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8           c6grid_00;
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                                   vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 48 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq00);
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_00,_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_00,one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,_fjsp_sub_v2r8(c6_00,f6A)),rinvsix,f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 48 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*48);
+}
index d603feb1b87d3d1db134eb15362e59da3425b8f5..5055ed655ceb21b5848e1b6081c1b80915e14436 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
 nb_kernel_info_t
     kernellist_sparc64_hpc_ace_double[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 91b5068925230411864c669fb5f608a3c67b4555..df45424c023b9ad2d0fa685d5655bcc5741ed678 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -147,6 +147,15 @@ void
     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    _fjsp_v2r8           c6grid_{I}{J};
+    /* #endfor */
+    real                 *vdwgridparam;
+    _fjsp_v2r8           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    _fjsp_v2r8           one_half = gmx_fjsp_set1_v2r8(0.5);
+    _fjsp_v2r8           minus_one = gmx_fjsp_set1_v2r8(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
     real             *ewtab;
@@ -186,6 +195,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = gmx_fjsp_set1_v2r8(fr->ic->sh_lj_ewald);
+    ewclj            = gmx_fjsp_set1_v2r8(fr->ewaldcoeff_lj);
+    ewclj2           = _fjsp_mul_v2r8(minus_one,_fjsp_mul_v2r8(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -242,8 +257,14 @@ void
     qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
+    c6_{I}{J}            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = gmx_fjsp_set1_v2r8(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
     c6_{I}{J}            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -525,8 +546,17 @@ void
             /*             #if ROUND == 'Loop' */
             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+
+           /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_fjsp_load_2real_swizzle_v2r8(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                                   vdwgridparam+vdwioffset{I}+vdwjidx{J}B);
+            /*                 #endif */
             /*             #else */
             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_fjsp_load_1real_swizzle_v2r8(vdwgridparam+vdwioffset{I}+vdwjidx{J}A);
+            /*                 #endif */
             /*             #endif */
             /*         #endif */
             /*     #endif */
@@ -821,6 +851,45 @@ void
             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _fjsp_mul_v2r8(ewclj2,rsq{I}{J});
+            ewclj6           = _fjsp_mul_v2r8(ewclj2,_fjsp_mul_v2r8(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(-ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+           poly             = _fjsp_mul_v2r8(exponent,_fjsp_madd_v2r8(_fjsp_mul_v2r8(ewcljrsq,ewcljrsq),one_half,_fjsp_sub_v2r8(one,ewcljrsq)));
+            /*                 #define INNERFLOPS INNERFLOPS+9 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _fjsp_mul_v2r8(_fjsp_madd_v2r8(-c6grid_{I}{J},_fjsp_sub_v2r8(one,poly),c6_{I}{J}),rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},_fjsp_mul_v2r8(rinvsix,rinvsix));
+            /*                 #define INNERFLOPS INNERFLOPS+5 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_{I}{J},_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                               _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw6,_fjsp_madd_v2r8(c6grid_{I}{J},sh_lj_ewald,_fjsp_mul_v2r8(c6_{I}{J},sh_vdw_invrcut6))),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+7 */
+            /*                 #else */
+           vvdw             = _fjsp_msub_v2r8(vvdw12,one_twelfth,_fjsp_mul_v2r8(vvdw6,one_sixth));         
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+           /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+           fvdw             = _fjsp_mul_v2r8(_fjsp_add_v2r8(vvdw12,_fjsp_msub_v2r8(_fjsp_mul_v2r8(c6grid_{I}{J},one_sixth),_fjsp_mul_v2r8(exponent,ewclj6),vvdw6)),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _fjsp_mul_v2r8(c6grid_{I}{J},_fjsp_msub_v2r8(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _fjsp_mul_v2r8(_fjsp_mul_v2r8(c6grid_{I}{J},one_sixth),_fjsp_mul_v2r8(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+            fvdw              = _fjsp_mul_v2r8(_fjsp_madd_v2r8(_fjsp_msub_v2r8(c12_{I}{J},rinvsix,_fjsp_sub_v2r8(c6_{I}{J},f6A)),rinvsix,f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+12 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 03e709a3f76047d136b2ae92feda91087b7a7e1f..cbe9ab78d073e090fe11148233422c1177fdd022 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse2 to reduce number of kernels and simply the template 
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse2_double.c
new file mode 100644 (file)
index 0000000..fb8b0ba
--- /dev/null
@@ -0,0 +1,743 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 82 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 82 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*82);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse2_double.c
new file mode 100644 (file)
index 0000000..f156207
--- /dev/null
@@ -0,0 +1,1271 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*177);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*143);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse2_double.c
new file mode 100644 (file)
index 0000000..437ae4c
--- /dev/null
@@ -0,0 +1,2599 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 450 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 450 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*450);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*374);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse2_double.c
new file mode 100644 (file)
index 0000000..e8b0974
--- /dev/null
@@ -0,0 +1,1449 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 203 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 203 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*203);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*169);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_double.c
new file mode 100644 (file)
index 0000000..a468859
--- /dev/null
@@ -0,0 +1,2789 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 479 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 479 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*479);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*403);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse2_double.c
new file mode 100644 (file)
index 0000000..2f7caa6
--- /dev/null
@@ -0,0 +1,685 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 69 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 69 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*69);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*59);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse2_double.c
new file mode 100644 (file)
index 0000000..d49d69e
--- /dev/null
@@ -0,0 +1,1137 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*134);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_double.c
new file mode 100644 (file)
index 0000000..79c9f96
--- /dev/null
@@ -0,0 +1,2237 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 397 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 397 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*397);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*347);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse2_double.c
new file mode 100644 (file)
index 0000000..be827df
--- /dev/null
@@ -0,0 +1,1279 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*177);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*157);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse2_double.c
new file mode 100644 (file)
index 0000000..fae7c28
--- /dev/null
@@ -0,0 +1,2391 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*423);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*373);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse2_double.c
new file mode 100644 (file)
index 0000000..df89edd
--- /dev/null
@@ -0,0 +1,643 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*62);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*49);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse2_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse2_double.c
new file mode 100644 (file)
index 0000000..1719de5
--- /dev/null
@@ -0,0 +1,589 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_double.h"
+#include "kernelutil_x86_sse2_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*51);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*46);
+}
index d7952f2f9cb9c1732fb470cab2ea30077f8a38eb..b997aea7ed2b92ce2d183538c4cfcb4fe816fe2a 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_double;
 nb_kernel_info_t
     kernellist_sse2_double[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double", "sse2_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double", "sse2_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double", "sse2_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 928844b5d6b7a703e9f91057e653e539a0a4f069..955b06d748bd60fa1a04481949a3d9bb66ca75ae 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -151,6 +151,15 @@ void
     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m128d           c6grid_{I}{J};
+    /* #endfor */
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -189,6 +198,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -245,8 +260,14 @@ void
     qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
     c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm_set1_pd(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
+    c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -528,8 +549,17 @@ void
             /*             #if ROUND == 'Loop' */
             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}B);
+            /*                 #endif */
             /*             #else */
             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset{I}+vdwjidx{J}A);
+            /*                 #endif */
             /*             #endif */
             /*         #endif */
             /*     #endif */
@@ -823,6 +853,45 @@ void
             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+           /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq{I}{J});
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch'*/
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_{I}{J},_mm_mul_pd(c6grid_{I}{J},_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_{I}{J},_mm_mul_pd(rinvsix,rinvsix));
+           /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_{I}{J},_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_{I}{J},sh_vdw_invrcut6),_mm_mul_pd(c6grid_{I}{J},sh_lj_ewald))),one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+10 */
+            /*                 #else */
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+           /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_{I}{J},one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_{I}{J},_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_{I}{J},one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_{I}{J},rinvsix),_mm_sub_pd(c6_{I}{J},f6A)),rinvsix),f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 3af643b6d51e8d9164aeaa6feff04beb8f195767..a323ea69019e19861f9ad9d716fc062498c9f4d1 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse2 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..c57c217
--- /dev/null
@@ -0,0 +1,860 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 82 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 83 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*83);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*63);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..47bf740
--- /dev/null
@@ -0,0 +1,1398 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 174 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*177);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 140 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*143);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..238c93d
--- /dev/null
@@ -0,0 +1,2714 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 450 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*383);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..ae898fb
--- /dev/null
@@ -0,0 +1,1578 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 200 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 204 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*204);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 166 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 170 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*170);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..2bb109c
--- /dev/null
@@ -0,0 +1,2910 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 479 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 489 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*489);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*413);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..999d9b0
--- /dev/null
@@ -0,0 +1,802 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 69 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 70 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*70);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse2_single.c
new file mode 100644 (file)
index 0000000..b01cd4c
--- /dev/null
@@ -0,0 +1,1264 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*134);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_single.c
new file mode 100644 (file)
index 0000000..5427e5d
--- /dev/null
@@ -0,0 +1,2352 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 397 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 406 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*406);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 356 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse2_single.c
new file mode 100644 (file)
index 0000000..4a93346
--- /dev/null
@@ -0,0 +1,1408 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 174 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 178 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*178);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 158 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*158);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse2_single.c
new file mode 100644 (file)
index 0000000..bd4a521
--- /dev/null
@@ -0,0 +1,2512 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 433 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*433);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+            
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
+            gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+            
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..a6c2c6e
--- /dev/null
@@ -0,0 +1,755 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*63);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse2_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse2_single.c
new file mode 100644 (file)
index 0000000..f2b7f69
--- /dev/null
@@ -0,0 +1,701 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse2_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse2_single.h"
+#include "kernelutil_x86_sse2_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 52 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*52);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*47);
+}
index 3e9a11fb5715ff01701134c2ee7703394e08bb52..546d1c87bc0714c7c1d005f39105b344d6de6933 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single;
 nb_kernel_info_t
     kernellist_sse2_single[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single", "sse2_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_single", "sse2_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 0cc9cb03907b7a0779ea8089be515441830ce938..6f7f1b37469559094b3969a854521df72db0a07e 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -154,6 +154,15 @@ void
     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m128           c6grid_{I}{J};
+    /* #endfor */
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -192,6 +201,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -248,8 +263,14 @@ void
     qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
     c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm_set1_ps(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
+    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -540,6 +561,12 @@ void
                                          vdwparam+vdwioffset{I}+vdwjidx{J}C,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}D,
                                          &c6_{I}{J},&c12_{I}{J});
+            /*         #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}B,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}C,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}D);
+            /*          #endif */
             /*         #endif */
             /*     #endif */
 
@@ -792,6 +819,45 @@ void
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq{I}{J});
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_{I}{J},_mm_mul_ps(c6grid_{I}{J},_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
+                                          _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6),_mm_mul_ps(c6grid_{I}{J},sh_lj_ewald))),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+10 */
+            /*                 #else */
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_{I}{J},one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_{I}{J},_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_{I}{J},one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),_mm_sub_ps(c6_{I}{J},f6A)),rinvsix),f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index f45b1fbfb1b4d806a98893a16d7f518c64c797d5..e21524a9fff67c8c1c3129904a70299ad3aae12b 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse2 to reduce number of kernels and simply the template 
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..dd87bac
--- /dev/null
@@ -0,0 +1,741 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 81 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 81 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*81);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..ef037e4
--- /dev/null
@@ -0,0 +1,1269 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 176 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 176 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*176);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*143);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_double.c
new file mode 100644 (file)
index 0000000..631758f
--- /dev/null
@@ -0,0 +1,2599 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 449 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 449 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*449);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*374);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..bf3dc0e
--- /dev/null
@@ -0,0 +1,1447 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 202 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 202 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*202);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*169);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_double.c
new file mode 100644 (file)
index 0000000..cc8a50a
--- /dev/null
@@ -0,0 +1,2789 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 478 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_pd(velec,cutoff_mask);
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 478 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*478);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_pd(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            }
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*403);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..47c0ea6
--- /dev/null
@@ -0,0 +1,683 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 69 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 69 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*69);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*59);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..dfc589b
--- /dev/null
@@ -0,0 +1,1135 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_pd(iq0,jq0);
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*134);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_double.c
new file mode 100644 (file)
index 0000000..8e3d214
--- /dev/null
@@ -0,0 +1,2237 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 397 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 397 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*397);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_01;
+    __m128d           c6grid_02;
+    __m128d           c6grid_10;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_20;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_pd(charge[inr+0]);
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_pd(iq0,jq0);
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_pd(iq0,jq1);
+    qq02             = _mm_mul_pd(iq0,jq2);
+    qq10             = _mm_mul_pd(iq1,jq0);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq20             = _mm_mul_pd(iq2,jq0);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx01             = _mm_sub_pd(ix0,jx1);
+            dy01             = _mm_sub_pd(iy0,jy1);
+            dz01             = _mm_sub_pd(iz0,jz1);
+            dx02             = _mm_sub_pd(ix0,jx2);
+            dy02             = _mm_sub_pd(iy0,jy2);
+            dz02             = _mm_sub_pd(iz0,jz2);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv01           = gmx_mm_invsqrt_pd(rsq01);
+            rinv02           = gmx_mm_invsqrt_pd(rsq02);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq01         = _mm_mul_pd(rinv01,rinv01);
+            rinvsq02         = _mm_mul_pd(rinv02,rinv02);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r00,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_pd(felec,fvdw);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_pd(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r01,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx01);
+            ty               = _mm_mul_pd(fscal,dy01);
+            tz               = _mm_mul_pd(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_pd(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r02,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx02);
+            ty               = _mm_mul_pd(fscal,dy02);
+            tz               = _mm_mul_pd(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*347);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..0ca4887
--- /dev/null
@@ -0,0 +1,1277 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*177);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_10;
+    __m128d           c6grid_20;
+    __m128d           c6grid_30;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx10             = _mm_sub_pd(ix1,jx0);
+            dy10             = _mm_sub_pd(iy1,jy0);
+            dz10             = _mm_sub_pd(iz1,jz0);
+            dx20             = _mm_sub_pd(ix2,jx0);
+            dy20             = _mm_sub_pd(iy2,jy0);
+            dz20             = _mm_sub_pd(iz2,jz0);
+            dx30             = _mm_sub_pd(ix3,jx0);
+            dy30             = _mm_sub_pd(iy3,jy0);
+            dz30             = _mm_sub_pd(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv10           = gmx_mm_invsqrt_pd(rsq10);
+            rinv20           = gmx_mm_invsqrt_pd(rsq20);
+            rinv30           = gmx_mm_invsqrt_pd(rsq30);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq10         = _mm_mul_pd(rinv10,rinv10);
+            rinvsq20         = _mm_mul_pd(rinv20,rinv20);
+            rinvsq30         = _mm_mul_pd(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _mm_load_sd(charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_pd(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_pd(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r10,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx10);
+            ty               = _mm_mul_pd(fscal,dy10);
+            tz               = _mm_mul_pd(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_pd(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_pd(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r20,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx20);
+            ty               = _mm_mul_pd(fscal,dy20);
+            tz               = _mm_mul_pd(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_pd(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_pd(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r30,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx30);
+            ty               = _mm_mul_pd(fscal,dy30);
+            tz               = _mm_mul_pd(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 157 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*157);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_double.c
new file mode 100644 (file)
index 0000000..a08a138
--- /dev/null
@@ -0,0 +1,2391 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_pd();
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
+            ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
+            ewtabFn          = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
+            felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
+            velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
+            velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
+            velecsum         = _mm_add_pd(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*423);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           c6grid_11;
+    __m128d           c6grid_12;
+    __m128d           c6grid_13;
+    __m128d           c6grid_21;
+    __m128d           c6grid_22;
+    __m128d           c6grid_23;
+    __m128d           c6grid_31;
+    __m128d           c6grid_32;
+    __m128d           c6grid_33;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128i          ewitab;
+    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_pd(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
+    iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
+    iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_pd(charge[inr+1]);
+    jq2              = _mm_set1_pd(charge[inr+2]);
+    jq3              = _mm_set1_pd(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_pd(iq1,jq1);
+    qq12             = _mm_mul_pd(iq1,jq2);
+    qq13             = _mm_mul_pd(iq1,jq3);
+    qq21             = _mm_mul_pd(iq2,jq1);
+    qq22             = _mm_mul_pd(iq2,jq2);
+    qq23             = _mm_mul_pd(iq2,jq3);
+    qq31             = _mm_mul_pd(iq3,jq1);
+    qq32             = _mm_mul_pd(iq3,jq2);
+    qq33             = _mm_mul_pd(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+        fix1             = _mm_setzero_pd();
+        fiy1             = _mm_setzero_pd();
+        fiz1             = _mm_setzero_pd();
+        fix2             = _mm_setzero_pd();
+        fiy2             = _mm_setzero_pd();
+        fiz2             = _mm_setzero_pd();
+        fix3             = _mm_setzero_pd();
+        fiy3             = _mm_setzero_pd();
+        fiz3             = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+            dx11             = _mm_sub_pd(ix1,jx1);
+            dy11             = _mm_sub_pd(iy1,jy1);
+            dz11             = _mm_sub_pd(iz1,jz1);
+            dx12             = _mm_sub_pd(ix1,jx2);
+            dy12             = _mm_sub_pd(iy1,jy2);
+            dz12             = _mm_sub_pd(iz1,jz2);
+            dx13             = _mm_sub_pd(ix1,jx3);
+            dy13             = _mm_sub_pd(iy1,jy3);
+            dz13             = _mm_sub_pd(iz1,jz3);
+            dx21             = _mm_sub_pd(ix2,jx1);
+            dy21             = _mm_sub_pd(iy2,jy1);
+            dz21             = _mm_sub_pd(iz2,jz1);
+            dx22             = _mm_sub_pd(ix2,jx2);
+            dy22             = _mm_sub_pd(iy2,jy2);
+            dz22             = _mm_sub_pd(iz2,jz2);
+            dx23             = _mm_sub_pd(ix2,jx3);
+            dy23             = _mm_sub_pd(iy2,jy3);
+            dz23             = _mm_sub_pd(iz2,jz3);
+            dx31             = _mm_sub_pd(ix3,jx1);
+            dy31             = _mm_sub_pd(iy3,jy1);
+            dz31             = _mm_sub_pd(iz3,jz1);
+            dx32             = _mm_sub_pd(ix3,jx2);
+            dy32             = _mm_sub_pd(iy3,jy2);
+            dz32             = _mm_sub_pd(iz3,jz2);
+            dx33             = _mm_sub_pd(ix3,jx3);
+            dy33             = _mm_sub_pd(iy3,jy3);
+            dz33             = _mm_sub_pd(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+            rinv11           = gmx_mm_invsqrt_pd(rsq11);
+            rinv12           = gmx_mm_invsqrt_pd(rsq12);
+            rinv13           = gmx_mm_invsqrt_pd(rsq13);
+            rinv21           = gmx_mm_invsqrt_pd(rsq21);
+            rinv22           = gmx_mm_invsqrt_pd(rsq22);
+            rinv23           = gmx_mm_invsqrt_pd(rsq23);
+            rinv31           = gmx_mm_invsqrt_pd(rsq31);
+            rinv32           = gmx_mm_invsqrt_pd(rsq32);
+            rinv33           = gmx_mm_invsqrt_pd(rsq33);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+            rinvsq11         = _mm_mul_pd(rinv11,rinv11);
+            rinvsq12         = _mm_mul_pd(rinv12,rinv12);
+            rinvsq13         = _mm_mul_pd(rinv13,rinv13);
+            rinvsq21         = _mm_mul_pd(rinv21,rinv21);
+            rinvsq22         = _mm_mul_pd(rinv22,rinv22);
+            rinvsq23         = _mm_mul_pd(rinv23,rinv23);
+            rinvsq31         = _mm_mul_pd(rinv31,rinv31);
+            rinvsq32         = _mm_mul_pd(rinv32,rinv32);
+            rinvsq33         = _mm_mul_pd(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_pd();
+            fjy0             = _mm_setzero_pd();
+            fjz0             = _mm_setzero_pd();
+            fjx1             = _mm_setzero_pd();
+            fjy1             = _mm_setzero_pd();
+            fjz1             = _mm_setzero_pd();
+            fjx2             = _mm_setzero_pd();
+            fjy2             = _mm_setzero_pd();
+            fjz2             = _mm_setzero_pd();
+            fjx3             = _mm_setzero_pd();
+            fjy3             = _mm_setzero_pd();
+            fjz3             = _mm_setzero_pd();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            fjx0             = _mm_add_pd(fjx0,tx);
+            fjy0             = _mm_add_pd(fjy0,ty);
+            fjz0             = _mm_add_pd(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_pd(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r11,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx11);
+            ty               = _mm_mul_pd(fscal,dy11);
+            tz               = _mm_mul_pd(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_pd(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r12,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx12);
+            ty               = _mm_mul_pd(fscal,dy12);
+            tz               = _mm_mul_pd(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_pd(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r13,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx13);
+            ty               = _mm_mul_pd(fscal,dy13);
+            tz               = _mm_mul_pd(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_pd(fix1,tx);
+            fiy1             = _mm_add_pd(fiy1,ty);
+            fiz1             = _mm_add_pd(fiz1,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_pd(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r21,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx21);
+            ty               = _mm_mul_pd(fscal,dy21);
+            tz               = _mm_mul_pd(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_pd(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r22,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx22);
+            ty               = _mm_mul_pd(fscal,dy22);
+            tz               = _mm_mul_pd(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_pd(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r23,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx23);
+            ty               = _mm_mul_pd(fscal,dy23);
+            tz               = _mm_mul_pd(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_pd(fix2,tx);
+            fiy2             = _mm_add_pd(fiy2,ty);
+            fiz2             = _mm_add_pd(fiz2,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_pd(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r31,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx31);
+            ty               = _mm_mul_pd(fscal,dy31);
+            tz               = _mm_mul_pd(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx1             = _mm_add_pd(fjx1,tx);
+            fjy1             = _mm_add_pd(fjy1,ty);
+            fjz1             = _mm_add_pd(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_pd(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r32,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx32);
+            ty               = _mm_mul_pd(fscal,dy32);
+            tz               = _mm_mul_pd(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx2             = _mm_add_pd(fjx2,tx);
+            fjy2             = _mm_add_pd(fjy2,ty);
+            fjz2             = _mm_add_pd(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_pd(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_pd(r33,ewtabscale);
+            ewitab           = _mm_cvttpd_epi32(ewrt);
+            eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
+            felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
+            felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx33);
+            ty               = _mm_mul_pd(fscal,dy33);
+            tz               = _mm_mul_pd(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_pd(fix3,tx);
+            fiy3             = _mm_add_pd(fiy3,ty);
+            fiz3             = _mm_add_pd(fiz3,tz);
+
+            fjx3             = _mm_add_pd(fjx3,tx);
+            fjy3             = _mm_add_pd(fjy3,ty);
+            fjz3             = _mm_add_pd(fjz3,tz);
+
+            gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*373);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..cb104de
--- /dev/null
@@ -0,0 +1,641 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 61 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_pd(vvdw,cutoff_mask);
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 61 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*61);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_pd(rcutoff_scalar);
+    rcutoff2         = _mm_mul_pd(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_pd(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_pd(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_pd(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_pd(fscal,cutoff_mask);
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*49);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse4_1_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse4_1_double.c
new file mode 100644 (file)
index 0000000..d536496
--- /dev/null
@@ -0,0 +1,587 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_double.h"
+#include "kernelutil_x86_sse4_1_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_pd();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
+            vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*51);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_double
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
+    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
+    __m128d           c6grid_00;
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    __m128d          dummy_mask,cutoff_mask;
+    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
+    __m128d          one     = _mm_set1_pd(1.0);
+    __m128d          two     = _mm_set1_pd(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_pd();
+        fiy0             = _mm_setzero_pd();
+        fiz0             = _mm_setzero_pd();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+            c6grid_00       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_pd(ix0,jx0);
+            dy00             = _mm_sub_pd(iy0,jy0);
+            dz00             = _mm_sub_pd(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_pd(rsq00);
+
+            rinvsq00         = _mm_mul_pd(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_pd(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq00);
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_pd(fscal,dx00);
+            ty               = _mm_mul_pd(fscal,dy00);
+            tz               = _mm_mul_pd(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_pd(fix0,tx);
+            fiy0             = _mm_add_pd(fiy0,ty);
+            fiz0             = _mm_add_pd(fiz0,tz);
+
+            gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*46);
+}
index 5fd50d2964ed4d969883acc8331eade28358f0b4..56d5b777fc43c9d44f2a40fdc34295b98f39cbf4 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_double;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_double;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_double;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_double;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_double;
 nb_kernel_info_t
     kernellist_sse4_1_double[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_double", "sse4_1_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_double", "sse4_1_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index 359850660e402f1605a317578c1130a3627a99f3..c313e80fedd6bf54f378a61aef4f9c8aacdee5f3 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -151,6 +151,15 @@ void
     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m128d           c6grid_{I}{J};
+    /* #endfor */
+    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128d           one_half = _mm_set1_pd(0.5);
+    __m128d           minus_one = _mm_set1_pd(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -189,6 +198,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald      = _mm_set1_pd(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_pd(fr->ewaldcoeff_lj);
+    ewclj2           = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -245,8 +260,14 @@ void
     qq{I}{J}             = _mm_mul_pd(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
     c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm_set1_pd(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
+    c6_{I}{J}            = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -528,8 +549,16 @@ void
             /*             #if ROUND == 'Loop' */
             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}B);
+            /*                 #endif */
             /*             #else */
             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+
+            /*                 #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset{I}+vdwjidx{J}A);
+            /*                 #endif */
             /*             #endif */
             /*         #endif */
             /*     #endif */
@@ -823,6 +852,46 @@ void
             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+
+           /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm_mul_pd(ewclj2,rsq{I}{J});
+            ewclj6           = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_d(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch'*/
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_pd(_mm_sub_pd(c6_{I}{J},_mm_mul_pd(c6grid_{I}{J},_mm_sub_pd(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_pd(c12_{I}{J},_mm_mul_pd(rinvsix,rinvsix));
+           /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+           vvdw             = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_{I}{J},_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_{I}{J},sh_vdw_invrcut6),_mm_mul_pd(c6grid_{I}{J},sh_lj_ewald))),one_sixth));
+           /*                 #define INNERFLOPS INNERFLOPS+9 */
+            /*                 #else */
+            vvdw             = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+           /*                 #endif */
+            /*                  ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                  #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_{I}{J},one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                  #endif */
+            /*              #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_pd(c6grid_{I}{J},_mm_sub_pd(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_pd(_mm_mul_pd(c6grid_{I}{J},one_sixth),_mm_mul_pd(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_{I}{J},rinvsix),_mm_sub_pd(c6_{I}{J},f6A)),rinvsix),f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*              #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index bd75e6c428e223e613a1bb9c822a5d063d905e7d..450d40e9f8bba956743cc9c1f7443c4a12d5d229 100755 (executable)
@@ -2,7 +2,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -120,6 +120,7 @@ VdwList = {
     'LennardJones'            : ['rinvsq'],
 #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simply the template
     'CubicSplineTable'        : ['rinv','r','table'],
+    'LJEwald'                 : ['rinv','rinvsq','r'],
 }
 
 
@@ -193,6 +194,7 @@ Abbreviation = {
     'CubicSplineTable'        : 'CSTab',
     'LennardJones'            : 'LJ',
     'Buckingham'              : 'Bham',
+    'LJEwald'                 : 'LJEw',
     'PotentialShift'          : 'Sh',
     'PotentialSwitch'         : 'Sw',
     'ExactCutoff'             : 'Cut',
@@ -282,7 +284,11 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         return 0
     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
     if((KernelVdwMod=='ExactCutoff') or
-       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham','LJEwald'])):
+        return 0
+
+    # For LJEwald, we only support shift
+    if(KernelVdw=='LJEwald' and KernelVdwMod=='PotentialSwitch'):
         return 0
 
     # Choose either switch or shift and don't mix them...
@@ -302,6 +308,10 @@ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,Kernel
         elif(KernelElec!='ReactionField'):
             return 0
 
+    #Only do LJ-PME if we are also doing PME for electrostatics, or no electrostatics at all.
+    if(KernelVdw=='LJEwald' and KernelElec not in ['Ewald','None']):
+        return 0
+
     return 1
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..8e94d9c
--- /dev/null
@@ -0,0 +1,864 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 82 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 83 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*83);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*63);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..0aaae28
--- /dev/null
@@ -0,0 +1,1402 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 174 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 177 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*177);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 140 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*143);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_single.c
new file mode 100644 (file)
index 0000000..31ed9d2
--- /dev/null
@@ -0,0 +1,2714 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 450 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 374 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq01,rcutoff2))
+            {
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq02,rcutoff2))
+            {
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*383);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..74230d5
--- /dev/null
@@ -0,0 +1,1582 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 200 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 204 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*204);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 166 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq10,rcutoff2))
+            {
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq20,rcutoff2))
+            {
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq30,rcutoff2))
+            {
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 170 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*170);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_sse4_1_single.c
new file mode 100644 (file)
index 0000000..15806e7
--- /dev/null
@@ -0,0 +1,2910 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 479 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_and_ps(velec,cutoff_mask);
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 489 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*489);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 403 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq11,rcutoff2))
+            {
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq12,rcutoff2))
+            {
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq13,rcutoff2))
+            {
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq21,rcutoff2))
+            {
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq22,rcutoff2))
+            {
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq23,rcutoff2))
+            {
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq31,rcutoff2))
+            {
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq32,rcutoff2))
+            {
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq33,rcutoff2))
+            {
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            }
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*413);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomP1P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..8aff4f1
--- /dev/null
@@ -0,0 +1,806 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 69 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 70 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*70);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        iq0              = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..7f718d3
--- /dev/null
@@ -0,0 +1,1268 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _mm_mul_ps(iq0,jq0);
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 134 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*134);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse4_1_single.c
new file mode 100644 (file)
index 0000000..1d9b2b3
--- /dev/null
@@ -0,0 +1,2352 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 397 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 406 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*406);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_01;
+    __m128           c6grid_02;
+    __m128           c6grid_10;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_20;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = _mm_set1_ps(charge[inr+0]);
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _mm_mul_ps(iq0,jq0);
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq01             = _mm_mul_ps(iq0,jq1);
+    qq02             = _mm_mul_ps(iq0,jq2);
+    qq10             = _mm_mul_ps(iq1,jq0);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq20             = _mm_mul_ps(iq2,jq0);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 347 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx01             = _mm_sub_ps(ix0,jx1);
+            dy01             = _mm_sub_ps(iy0,jy1);
+            dz01             = _mm_sub_ps(iz0,jz1);
+            dx02             = _mm_sub_ps(ix0,jx2);
+            dy02             = _mm_sub_ps(iy0,jy2);
+            dz02             = _mm_sub_ps(iz0,jz2);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
+            rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv01           = gmx_mm_invsqrt_ps(rsq01);
+            rinv02           = gmx_mm_invsqrt_ps(rsq02);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq01         = _mm_mul_ps(rinv01,rinv01);
+            rinvsq02         = _mm_mul_ps(rinv02,rinv02);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r00,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = _mm_add_ps(felec,fvdw);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _mm_mul_ps(rsq01,rinv01);
+            r01              = _mm_andnot_ps(dummy_mask,r01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r01,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx01);
+            ty               = _mm_mul_ps(fscal,dy01);
+            tz               = _mm_mul_ps(fscal,dz01);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _mm_mul_ps(rsq02,rinv02);
+            r02              = _mm_andnot_ps(dummy_mask,r02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r02,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx02);
+            ty               = _mm_mul_ps(fscal,dy02);
+            tz               = _mm_mul_ps(fscal,dz02);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 356 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..c160cde
--- /dev/null
@@ -0,0 +1,1412 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 174 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 178 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*178);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    __m128           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_10;
+    __m128           c6grid_20;
+    __m128           c6grid_30;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx10             = _mm_sub_ps(ix1,jx0);
+            dy10             = _mm_sub_ps(iy1,jy0);
+            dz10             = _mm_sub_ps(iz1,jz0);
+            dx20             = _mm_sub_ps(ix2,jx0);
+            dy20             = _mm_sub_ps(iy2,jy0);
+            dz20             = _mm_sub_ps(iz2,jz0);
+            dx30             = _mm_sub_ps(ix3,jx0);
+            dy30             = _mm_sub_ps(iy3,jy0);
+            dz30             = _mm_sub_ps(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
+            rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
+            rsq30            = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv10           = gmx_mm_invsqrt_ps(rsq10);
+            rinv20           = gmx_mm_invsqrt_ps(rsq20);
+            rinv30           = gmx_mm_invsqrt_ps(rsq30);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq10         = _mm_mul_ps(rinv10,rinv10);
+            rinvsq20         = _mm_mul_ps(rinv20,rinv20);
+            rinvsq30         = _mm_mul_ps(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
+                                                              charge+jnrC+0,charge+jnrD+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _mm_mul_ps(rsq10,rinv10);
+            r10              = _mm_andnot_ps(dummy_mask,r10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _mm_mul_ps(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r10,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx10);
+            ty               = _mm_mul_ps(fscal,dy10);
+            tz               = _mm_mul_ps(fscal,dz10);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _mm_mul_ps(rsq20,rinv20);
+            r20              = _mm_andnot_ps(dummy_mask,r20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _mm_mul_ps(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r20,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx20);
+            ty               = _mm_mul_ps(fscal,dy20);
+            tz               = _mm_mul_ps(fscal,dz20);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _mm_mul_ps(rsq30,rinv30);
+            r30              = _mm_andnot_ps(dummy_mask,r30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _mm_mul_ps(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r30,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx30);
+            ty               = _mm_mul_ps(fscal,dy30);
+            tz               = _mm_mul_ps(fscal,dz30);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 158 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*158);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJEw_GeomW4W4_sse4_1_single.c
new file mode 100644 (file)
index 0000000..8ba0d49
--- /dev/null
@@ -0,0 +1,2512 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Reset potential sums */
+        velecsum         = _mm_setzero_ps();
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 423 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            ewitab           = _mm_slli_epi32(ewitab,2);
+            ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
+            ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
+            ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
+            ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
+            _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
+            felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
+            velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
+            velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _mm_andnot_ps(dummy_mask,velec);
+            velecsum         = _mm_add_ps(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 433 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*433);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LJEwald
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
+    __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
+    __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
+    __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           c6grid_11;
+    __m128           c6grid_12;
+    __m128           c6grid_13;
+    __m128           c6grid_21;
+    __m128           c6grid_22;
+    __m128           c6grid_23;
+    __m128           c6grid_31;
+    __m128           c6grid_32;
+    __m128           c6grid_33;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128i          ewitab;
+    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = _mm_set1_ps(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
+    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
+    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
+    iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = _mm_set1_ps(charge[inr+1]);
+    jq2              = _mm_set1_ps(charge[inr+2]);
+    jq3              = _mm_set1_ps(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
+    qq11             = _mm_mul_ps(iq1,jq1);
+    qq12             = _mm_mul_ps(iq1,jq2);
+    qq13             = _mm_mul_ps(iq1,jq3);
+    qq21             = _mm_mul_ps(iq2,jq1);
+    qq22             = _mm_mul_ps(iq2,jq2);
+    qq23             = _mm_mul_ps(iq2,jq3);
+    qq31             = _mm_mul_ps(iq3,jq1);
+    qq32             = _mm_mul_ps(iq3,jq2);
+    qq33             = _mm_mul_ps(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+        fix1             = _mm_setzero_ps();
+        fiy1             = _mm_setzero_ps();
+        fiz1             = _mm_setzero_ps();
+        fix2             = _mm_setzero_ps();
+        fiy2             = _mm_setzero_ps();
+        fiz2             = _mm_setzero_ps();
+        fix3             = _mm_setzero_ps();
+        fiy3             = _mm_setzero_ps();
+        fiz3             = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 373 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+            dx11             = _mm_sub_ps(ix1,jx1);
+            dy11             = _mm_sub_ps(iy1,jy1);
+            dz11             = _mm_sub_ps(iz1,jz1);
+            dx12             = _mm_sub_ps(ix1,jx2);
+            dy12             = _mm_sub_ps(iy1,jy2);
+            dz12             = _mm_sub_ps(iz1,jz2);
+            dx13             = _mm_sub_ps(ix1,jx3);
+            dy13             = _mm_sub_ps(iy1,jy3);
+            dz13             = _mm_sub_ps(iz1,jz3);
+            dx21             = _mm_sub_ps(ix2,jx1);
+            dy21             = _mm_sub_ps(iy2,jy1);
+            dz21             = _mm_sub_ps(iz2,jz1);
+            dx22             = _mm_sub_ps(ix2,jx2);
+            dy22             = _mm_sub_ps(iy2,jy2);
+            dz22             = _mm_sub_ps(iz2,jz2);
+            dx23             = _mm_sub_ps(ix2,jx3);
+            dy23             = _mm_sub_ps(iy2,jy3);
+            dz23             = _mm_sub_ps(iz2,jz3);
+            dx31             = _mm_sub_ps(ix3,jx1);
+            dy31             = _mm_sub_ps(iy3,jy1);
+            dz31             = _mm_sub_ps(iz3,jz1);
+            dx32             = _mm_sub_ps(ix3,jx2);
+            dy32             = _mm_sub_ps(iy3,jy2);
+            dz32             = _mm_sub_ps(iz3,jz2);
+            dx33             = _mm_sub_ps(ix3,jx3);
+            dy33             = _mm_sub_ps(iy3,jy3);
+            dz33             = _mm_sub_ps(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+            rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
+            rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
+            rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
+            rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
+            rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
+            rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
+            rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
+            rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
+            rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+            rinv11           = gmx_mm_invsqrt_ps(rsq11);
+            rinv12           = gmx_mm_invsqrt_ps(rsq12);
+            rinv13           = gmx_mm_invsqrt_ps(rsq13);
+            rinv21           = gmx_mm_invsqrt_ps(rsq21);
+            rinv22           = gmx_mm_invsqrt_ps(rsq22);
+            rinv23           = gmx_mm_invsqrt_ps(rsq23);
+            rinv31           = gmx_mm_invsqrt_ps(rsq31);
+            rinv32           = gmx_mm_invsqrt_ps(rsq32);
+            rinv33           = gmx_mm_invsqrt_ps(rsq33);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+            rinvsq11         = _mm_mul_ps(rinv11,rinv11);
+            rinvsq12         = _mm_mul_ps(rinv12,rinv12);
+            rinvsq13         = _mm_mul_ps(rinv13,rinv13);
+            rinvsq21         = _mm_mul_ps(rinv21,rinv21);
+            rinvsq22         = _mm_mul_ps(rinv22,rinv22);
+            rinvsq23         = _mm_mul_ps(rinv23,rinv23);
+            rinvsq31         = _mm_mul_ps(rinv31,rinv31);
+            rinvsq32         = _mm_mul_ps(rinv32,rinv32);
+            rinvsq33         = _mm_mul_ps(rinv33,rinv33);
+
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+            fjx1             = _mm_setzero_ps();
+            fjy1             = _mm_setzero_ps();
+            fjz1             = _mm_setzero_ps();
+            fjx2             = _mm_setzero_ps();
+            fjy2             = _mm_setzero_ps();
+            fjz2             = _mm_setzero_ps();
+            fjx3             = _mm_setzero_ps();
+            fjy3             = _mm_setzero_ps();
+            fjz3             = _mm_setzero_ps();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _mm_mul_ps(rsq11,rinv11);
+            r11              = _mm_andnot_ps(dummy_mask,r11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r11,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx11);
+            ty               = _mm_mul_ps(fscal,dy11);
+            tz               = _mm_mul_ps(fscal,dz11);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _mm_mul_ps(rsq12,rinv12);
+            r12              = _mm_andnot_ps(dummy_mask,r12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r12,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx12);
+            ty               = _mm_mul_ps(fscal,dy12);
+            tz               = _mm_mul_ps(fscal,dz12);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _mm_mul_ps(rsq13,rinv13);
+            r13              = _mm_andnot_ps(dummy_mask,r13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r13,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx13);
+            ty               = _mm_mul_ps(fscal,dy13);
+            tz               = _mm_mul_ps(fscal,dz13);
+
+            /* Update vectorial force */
+            fix1             = _mm_add_ps(fix1,tx);
+            fiy1             = _mm_add_ps(fiy1,ty);
+            fiz1             = _mm_add_ps(fiz1,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _mm_mul_ps(rsq21,rinv21);
+            r21              = _mm_andnot_ps(dummy_mask,r21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r21,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx21);
+            ty               = _mm_mul_ps(fscal,dy21);
+            tz               = _mm_mul_ps(fscal,dz21);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _mm_mul_ps(rsq22,rinv22);
+            r22              = _mm_andnot_ps(dummy_mask,r22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r22,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx22);
+            ty               = _mm_mul_ps(fscal,dy22);
+            tz               = _mm_mul_ps(fscal,dz22);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _mm_mul_ps(rsq23,rinv23);
+            r23              = _mm_andnot_ps(dummy_mask,r23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r23,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx23);
+            ty               = _mm_mul_ps(fscal,dy23);
+            tz               = _mm_mul_ps(fscal,dz23);
+
+            /* Update vectorial force */
+            fix2             = _mm_add_ps(fix2,tx);
+            fiy2             = _mm_add_ps(fiy2,ty);
+            fiz2             = _mm_add_ps(fiz2,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _mm_mul_ps(rsq31,rinv31);
+            r31              = _mm_andnot_ps(dummy_mask,r31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r31,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx31);
+            ty               = _mm_mul_ps(fscal,dy31);
+            tz               = _mm_mul_ps(fscal,dz31);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx1             = _mm_add_ps(fjx1,tx);
+            fjy1             = _mm_add_ps(fjy1,ty);
+            fjz1             = _mm_add_ps(fjz1,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _mm_mul_ps(rsq32,rinv32);
+            r32              = _mm_andnot_ps(dummy_mask,r32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r32,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx32);
+            ty               = _mm_mul_ps(fscal,dy32);
+            tz               = _mm_mul_ps(fscal,dz32);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx2             = _mm_add_ps(fjx2,tx);
+            fjy2             = _mm_add_ps(fjy2,ty);
+            fjz2             = _mm_add_ps(fjz2,tz);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _mm_mul_ps(rsq33,rinv33);
+            r33              = _mm_andnot_ps(dummy_mask,r33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _mm_mul_ps(r33,ewtabscale);
+            ewitab           = _mm_cvttps_epi32(ewrt);
+            eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
+            gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
+                                         ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
+                                         &ewtabF,&ewtabFn);
+            felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
+            felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx33);
+            ty               = _mm_mul_ps(fscal,dy33);
+            tz               = _mm_mul_ps(fscal,dz33);
+
+            /* Update vectorial force */
+            fix3             = _mm_add_ps(fix3,tx);
+            fiy3             = _mm_add_ps(fiy3,ty);
+            fiz3             = _mm_add_ps(fiz3,tz);
+
+            fjx3             = _mm_add_ps(fjx3,tx);
+            fjy3             = _mm_add_ps(fjy3,ty);
+            fjz3             = _mm_add_ps(fjz3,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
+                                                   fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
+                                                   fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 383 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..5fea300
--- /dev/null
@@ -0,0 +1,759 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 63 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*63);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = _mm_set1_ps(rcutoff_scalar);
+    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
+    rvdw             = _mm_set1_ps(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            }
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse4_1_single.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecNone_VdwLJEw_GeomP1P1_sse4_1_single.c
new file mode 100644 (file)
index 0000000..43f1be9
--- /dev/null
@@ -0,0 +1,705 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "gromacs/simd/math_x86_sse4_1_single.h"
+#include "kernelutil_x86_sse4_1_single.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _mm_setzero_ps();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
+            vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 52 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*52);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_single
+ * Electrostatics interaction: None
+ * VdW interaction:            LJEwald
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_single
+                    (t_nblist                    * gmx_restrict       nlist,
+                     rvec                        * gmx_restrict          xx,
+                     rvec                        * gmx_restrict          ff,
+                     t_forcerec                  * gmx_restrict          fr,
+                     t_mdatoms                   * gmx_restrict     mdatoms,
+                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
+                     t_nrnb                      * gmx_restrict        nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
+     * just 0 for non-waters.
+     * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
+    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
+    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
+    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
+    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
+    __m128           c6grid_00;
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    __m128           dummy_mask,cutoff_mask;
+    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
+    __m128           one     = _mm_set1_ps(1.0);
+    __m128           two     = _mm_set1_ps(2.0);
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    j_coord_offsetC = 0;
+    j_coord_offsetD = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _mm_setzero_ps();
+        fiy0             = _mm_setzero_ps();
+        fiz0             = _mm_setzero_ps();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            jnrC             = jjnr[jidx+2];
+            jnrD             = jjnr[jidx+3];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
+            /* Sign of each element will be negative for non-real atoms.
+             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
+             * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
+             */
+            dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+            j_coord_offsetC  = DIM*jnrC;
+            j_coord_offsetD  = DIM*jnrD;
+
+            /* load j atom coordinates */
+            gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              x+j_coord_offsetC,x+j_coord_offsetD,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _mm_sub_ps(ix0,jx0);
+            dy00             = _mm_sub_ps(iy0,jy0);
+            dz00             = _mm_sub_ps(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
+
+            rinv00           = gmx_mm_invsqrt_ps(rsq00);
+
+            rinvsq00         = _mm_mul_ps(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            vdwjidx0C        = 2*vdwtype[jnrC+0];
+            vdwjidx0D        = 2*vdwtype[jnrD+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _mm_mul_ps(rsq00,rinv00);
+            r00              = _mm_andnot_ps(dummy_mask,r00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,
+                                         vdwparam+vdwioffset0+vdwjidx0C,
+                                         vdwparam+vdwioffset0+vdwjidx0D,
+                                         &c6_00,&c12_00);
+
+            c6grid_00       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0B,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0C,
+                                                               vdwgridparam+vdwioffset0+vdwjidx0D);
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
+
+            fscal            = fvdw;
+
+            fscal            = _mm_andnot_ps(dummy_mask,fscal);
+
+            /* Calculate temporary vectorial force */
+            tx               = _mm_mul_ps(fscal,dx00);
+            ty               = _mm_mul_ps(fscal,dy00);
+            tz               = _mm_mul_ps(fscal,dz00);
+
+            /* Update vectorial force */
+            fix0             = _mm_add_ps(fix0,tx);
+            fiy0             = _mm_add_ps(fiy0,ty);
+            fiz0             = _mm_add_ps(fiz0,tz);
+
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*47);
+}
index b61da3ddcb49ae3b1aca8d57d499b7399fe2e158..811107d564bd277abd218edbc448167cecf6728a 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "../nb_kernel.h"
 
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_single;
@@ -48,6 +52,16 @@ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single;
@@ -78,6 +92,16 @@ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single;
 nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single;
@@ -259,6 +283,10 @@ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_single;
 nb_kernel_info_t
     kernellist_sse4_1_single[] =
 {
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
@@ -267,6 +295,16 @@ nb_kernel_info_t
     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single", "sse4_1_single", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LJEwald", "None", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
@@ -297,6 +335,16 @@ nb_kernel_info_t
     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LJEwald", "PotentialShift", "Water4Water4", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse4_1_single", "sse4_1_single", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
index e3beb992b1a7f38fbcdd7ba2f6d1929289cdb0b4..669739d531f191a7c4a2cd1ce05c92d81a5f79cf 100644 (file)
@@ -2,7 +2,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -154,6 +154,15 @@ void
     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
     real             *vftab;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    /* #for I,J in PAIRS_IJ */
+    __m128           c6grid_{I}{J};
+    /* #endfor */
+    __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
+    real             *vdwgridparam;
+    __m128           one_half  = _mm_set1_ps(0.5);
+    __m128           minus_one = _mm_set1_ps(-1.0);
+    /* #endif */
     /* #if 'Ewald' in KERNEL_ELEC */
     __m128i          ewitab;
     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
@@ -192,6 +201,12 @@ void
     vdwparam         = fr->nbfp;
     vdwtype          = mdatoms->typeA;
     /* #endif */
+    /* #if 'LJEwald' in KERNEL_VDW */
+    vdwgridparam     = fr->ljpme_c6grid;
+    sh_lj_ewald             = _mm_set1_ps(fr->ic->sh_lj_ewald);
+    ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
+    ewclj2          = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
+    /* #endif */
 
     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
     vftab            = kernel_data->table_elec_vdw->data;
@@ -248,8 +263,14 @@ void
     qq{I}{J}             = _mm_mul_ps(iq{I},jq{J});
     /*         #endif */
     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    /*             #if 'LJEwald' in KERNEL_VDW */
     c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
     c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    c6grid_{I}{J}        = _mm_set1_ps(vdwgridparam[vdwioffset{I}+vdwjidx{J}A]);
+    /*             #else */
+    c6_{I}{J}            = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = _mm_set1_ps(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*             #endif */
     /*         #endif */
     /*     #endfor */
     /* #endif */
@@ -540,6 +561,13 @@ void
                                          vdwparam+vdwioffset{I}+vdwjidx{J}C,
                                          vdwparam+vdwioffset{I}+vdwjidx{J}D,
                                          &c6_{I}{J},&c12_{I}{J});
+
+            /*         #if 'LJEwald' in KERNEL_VDW */
+            c6grid_{I}{J}       = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset{I}+vdwjidx{J}A,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}B,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}C,
+                                                               vdwgridparam+vdwioffset{I}+vdwjidx{J}D);
+            /*          #endif */
             /*         #endif */
             /*     #endif */
 
@@ -791,6 +819,45 @@ void
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv{I}{J})));
             /*                 #define INNERFLOPS INNERFLOPS+4 */
             /*             #endif */
+
+            /*         #elif KERNEL_VDW=='LJEwald' */
+
+            /* Analytical LJ-PME */
+            rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            ewcljrsq         = _mm_mul_ps(ewclj2,rsq{I}{J});
+            ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
+            exponent         = gmx_simd_exp_r(ewcljrsq);
+            /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
+            poly             = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
+            /*             #define INNERFLOPS INNERFLOPS+11 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
+            vvdw6            = _mm_mul_ps(_mm_sub_ps(c6_{I}{J},_mm_mul_ps(c6grid_{I}{J},_mm_sub_ps(one,poly))),rinvsix);
+            vvdw12           = _mm_mul_ps(c12_{I}{J},_mm_mul_ps(rinvsix,rinvsix));
+           /*                 #define INNERFLOPS INNERFLOPS+6 */
+           /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+           vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_{I}{J},_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
+                              _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_{I}{J},sh_vdw_invrcut6),_mm_mul_ps(c6grid_{I}{J},sh_lj_ewald))),one_sixth));
+           /*                 #define INNERFLOPS INNERFLOPS+10 */
+           /*                 #else */
+            vvdw             = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+           /*                 #endif */
+            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                 #if 'Force' in KERNEL_VF */
+            /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
+            fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_{I}{J},one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /* f6A = 6 * C6grid * (1 - poly) */
+            f6A              = _mm_mul_ps(c6grid_{I}{J},_mm_sub_ps(one,poly));
+            /* f6B = C6grid * exponent * beta^6 */
+            f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_{I}{J},one_sixth),_mm_mul_ps(exponent,ewclj6));
+            /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
+           fvdw              = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_{I}{J},rinvsix),_mm_sub_ps(c6_{I}{J},f6A)),rinvsix),f6B),rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+11 */
+            /*             #endif */
             /*         #endif */
             /*         ## End of check for vdw interaction forms */
             /*     #endif */
index 14cbadd1bc9afddf902ab571f07b6124a86087e6..ff641f450efd1ea5dac8f957c005cec8135e9755 100644 (file)
@@ -1225,6 +1225,18 @@ void check_ir(const char *mdparin, t_inputrec *ir, t_gromppopts *opts,
         }
     }
 
+    if (ir->vdwtype == evdwPME)
+    {
+        if (!(ir->vdw_modifier == eintmodNONE || ir->vdw_modifier == eintmodPOTSHIFT))
+        {
+            sprintf(err_buf, "With vdwtype = %s, the only supported modifiers are %s a\
+nd %s",
+                    evdw_names[ir->vdwtype],
+                    eintmod_names[eintmodPOTSHIFT],
+                    eintmod_names[eintmodNONE]);
+        }
+    }
+
     if (ir->cutoff_scheme == ecutsGROUP)
     {
         if (((ir->coulomb_modifier != eintmodNONE && ir->rcoulomb == ir->rlist) ||
index bb479d1bbbf22efa26f929fda094435370ad9ee1..d8eaab27f694f310ae2b4f479af9e08ead8a91c2 100644 (file)
@@ -429,6 +429,7 @@ enum gmx_nbkernel_vdw
     GMX_NBKERNEL_VDW_LENNARDJONES,
     GMX_NBKERNEL_VDW_BUCKINGHAM,
     GMX_NBKERNEL_VDW_CUBICSPLINETABLE,
+    GMX_NBKERNEL_VDW_LJEWALD,
     GMX_NBKERNEL_VDW_NR
 };
 /* Types of interactions inside the neighborlist
index aa0a976ae4ed87343869c8fe08159721a5f26e0c..4adcbf0d2a0b0e0426b95033b297beb4ee14be84 100644 (file)
@@ -374,6 +374,7 @@ typedef struct {
     int      ntype; /* Number of atom types */
     gmx_bool bBHAM;
     real    *nbfp;
+    real    *ljpme_c6grid; /* C6-values used on grid in LJPME */
 
     /* Energy group pair flags */
     int *egp_flags;
index bd22a259fa1a7ae23a10b9ac52deeb2fd1f29541..4c46b4804e461f09b6290242636a27ac77dd3c52 100644 (file)
@@ -158,6 +158,46 @@ static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
     return nbfp;
 }
 
+static real *make_ljpme_c6grid(const gmx_ffparams_t *idef, t_forcerec *fr)
+{
+    int   i, j, k, atnr;
+    real  c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+    real *grid;
+
+    /* For LJ-PME simulations, we correct the energies with the reciprocal space
+     * inside of the cut-off. To do this the non-bonded kernels needs to have
+     * access to the C6-values used on the reciprocal grid in pme.c
+     */
+
+    atnr = idef->atnr;
+    snew(grid, 2*atnr*atnr);
+    for (i = k = 0; (i < atnr); i++)
+    {
+        for (j = 0; (j < atnr); j++, k++)
+        {
+            c6i  = idef->iparams[i*(atnr+1)].lj.c6;
+            c12i = idef->iparams[i*(atnr+1)].lj.c12;
+            c6j  = idef->iparams[j*(atnr+1)].lj.c6;
+            c12j = idef->iparams[j*(atnr+1)].lj.c12;
+            c6   = sqrt(c6i * c6j);
+            if (fr->ljpme_combination_rule == eljpmeLB
+                && !gmx_numzero(c6) && !gmx_numzero(c12i) && !gmx_numzero(c12j))
+            {
+                sigmai = pow(c12i / c6i, 1.0/6.0);
+                sigmaj = pow(c12j / c6j, 1.0/6.0);
+                epsi   = c6i * c6i / c12i;
+                epsj   = c6j * c6j / c12j;
+                c6     = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
+            }
+            /* Store the elements at the same relative positions as C6 in nbfp in order
+             * to simplify access in the kernels
+             */
+            grid[2*(atnr*i+j)] = c6*6.0;
+        }
+    }
+    return grid;
+}
+
 static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
 {
     real *nbfp;
@@ -1955,10 +1995,6 @@ init_interaction_const(FILE                       *fp,
             {
                 real crc2;
 
-                if (fr->cutoff_scheme == ecutsGROUP)
-                {
-                    gmx_fatal(FARGS, "Potential-shift is not (yet) implemented for LJ-PME with cutoff-scheme=group");
-                }
                 crc2            = sqr(ic->ewaldcoeff_lj*ic->rvdw);
                 ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
             }
@@ -2582,7 +2618,6 @@ void init_forcerec(FILE              *fp,
     switch (fr->vdwtype)
     {
         case evdwCUT:
-        case evdwPME:
             if (fr->bBHAM)
             {
                 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
@@ -2592,6 +2627,9 @@ void init_forcerec(FILE              *fp,
                 fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
             }
             break;
+        case evdwPME:
+            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LJEWALD;
+            break;
 
         case evdwSWITCH:
         case evdwSHIFT:
@@ -2616,8 +2654,8 @@ void init_forcerec(FILE              *fp,
 
     if (ir->cutoff_scheme == ecutsGROUP)
     {
-        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
-                          !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+        fr->bvdwtab    = ((fr->vdwtype != evdwCUT || !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+                          && !EVDW_PME(fr->vdwtype));
         /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
         fr->bcoultab   = !(fr->eeltype == eelCUT ||
                            fr->eeltype == eelEWALD ||
@@ -2776,6 +2814,10 @@ void init_forcerec(FILE              *fp,
     {
         fr->ntype = mtop->ffparams.atnr;
         fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+        if (EVDW_PME(fr->vdwtype))
+        {
+            fr->ljpme_c6grid  = make_ljpme_c6grid(&mtop->ffparams, fr);
+        }
     }
 
     /* Copy the energy group exclusions */